import numpy as np
import pandas as pd 
import seaborn as sns 
sns.set(style = 'ticks')
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker

import os 
for dirname, _, filenames in os.walk(''):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import sklearn 
import scipy 
import warnings 
warnings.filterwarnings("ignore", category = DeprecationWarning)
pd.set_option('display.max_columns', 100)
%matplotlib inline

def format_spines(ax, right_border=True):
    
    ax.spines['bottom'].set_color('#666666')
    ax.spines['left'].set_color('#666666')
    ax.spines['top'].set_visible(False)
    if right_border:
        ax.spines['right'].set_color('#FFFFFF')
    else:
        ax.spines['right'].set_color('#FFFFFF')
    ax.patch.set_facecolor('#FFFFFF')

def count_plot(feature, df, colors='Greens_d', hue=False, ax=None, title=''):
    
    # Preparing variables
    ncount = len(df)
    if hue != False:
        ax = sns.countplot(x=feature, data=df, palette=colors, hue=hue, ax=ax)
    else:
        ax = sns.countplot(x=feature, data=df, palette=colors, ax=ax)
        
    format_spines(ax)
    
    # Setting percentage
    for p in ax.patches:
        x=p.get_bbox().get_points()[:,0]
        y=p.get_bbox().get_points()[1,1]
#        ax.annotate('{:.1f}%'.format(100.*y/ncount), (x.mean(), y), 
#                ha='center', va='bottom') # set the alignment of the text
        ax.annotate(y, (x.mean(), y), 
                ha='center', va='bottom') # set the alignment of the text
    
    # Final configuration
    if not hue:
        ax.set_title(df[feature].describe().name + ' Analysis', size=13, pad=15)
    else:
        ax.set_title(df[feature].describe().name + ' Analysis by ' + hue, size=13, pad=15)  
    if title != '':
        ax.set_title(title)       
    plt.tight_layout()
    
    
def bar_plot(x, y, df, colors='Blues_d', hue=False, ax=None, value=False, title=''):
    
    # Preparing variables
    try:
        ncount = sum(df[y])
    except:
        ncount = sum(df[x])
    #fig, ax = plt.subplots()
    if hue != False:
        ax = sns.barplot(x=x, y=y, data=df, palette=colors, hue=hue, ax=ax, ci=None)
    else:
        ax = sns.barplot(x=x, y=y, data=df, palette=colors, ax=ax, ci=None)

    # Setting borders
    format_spines(ax)

    # Setting percentage
    for p in ax.patches:
        xp=p.get_bbox().get_points()[:,0]
        yp=p.get_bbox().get_points()[1,1]
        if value:
            ax.annotate('{:.2f}k'.format(yp/1000), (xp.mean(), yp), 
                    ha='center', va='bottom') # set the alignment of the text
        else:
            ax.annotate('{:.1f}%'.format(100.*yp/ncount), (xp.mean(), yp), 
                    ha='center', va='bottom') # set the alignment of the text
    if not hue:
        ax.set_title(df[x].describe().name + ' Analysis', size=12, pad=15)
    else:
        ax.set_title(df[x].describe().name + ' Analysis by ' + hue, size=12, pad=15)
    if title != '':
        ax.set_title(title)  
    plt.tight_layout()
    
    
def categorical_plot(cols_cat, axs, df):
    
    idx_row = 0
    for col in cols_cat:
        # Returning column index
        idx_col = cols_cat.index(col)

        # Verifying brake line in figure (second row)
        if idx_col >= 3:
            idx_col -= 3
            idx_row = 1

        # Plot params
        names = df[col].value_counts().index
        heights = df[col].value_counts().values

        # Bar chart
        axs[idx_row, idx_col].bar(names, heights, color='navy')
        if (idx_row, idx_col) == (0, 2):
            y_pos = np.arange(len(names))
            axs[idx_row, idx_col].tick_params(axis='x', labelrotation=30)
        if (idx_row, idx_col) == (1, 1):
            y_pos = np.arange(len(names))
            axs[idx_row, idx_col].tick_params(axis='x', labelrotation=90)

        total = df[col].value_counts().sum()
        axs[idx_row, idx_col].patch.set_facecolor('#FFFFFF')
        format_spines(axs[idx_row, idx_col], right_border=False)
        for p in axs[idx_row, idx_col].patches:
            w, h = p.get_width(), p.get_height()
            x, y = p.get_xy()
            axs[idx_row, idx_col].annotate('{:.1%}'.format(h/1000), (p.get_x()+.29*w,
                                            p.get_y()+h+20), color='k')

        # Plot configuration
        axs[idx_row, idx_col].set_title(col, size=12)
        axs[idx_row, idx_col].set_ylim(0, heights.max()+120)

#Load data

order = pd.read_csv("olist_orders_dataset.csv")
customer = pd.read_csv("olist_order_customer_dataset.csv")
review = pd.read_csv("olist_order_reviews_dataset.csv")
payment = pd.read_csv("olist_order_payments_dataset.csv")
order_item = pd.read_csv("olist_order_items_dataset.csv")
product = pd.read_csv("olist_products_dataset.csv")
seller = pd.read_csv('olist_sellers_dataset.csv')
geo = pd.read_csv('olist_geolocation_dataset.csv')
translation = pd.read_csv('product_category_name_translation.csv')

# Data Pre-processing 
# Translate Product Category

#Convert Timestamp Data

order_date = ['order_purchase_timestamp', u'order_approved_at', 
              u'order_delivered_carrier_date', u'order_delivered_customer_date', 
              u'order_delivered_customer_date', u'order_estimated_delivery_date']


for items in order_date:
    order[items] = pd.to_datetime(order[items], format = '%Y-%m-%d %H:%M:%S')

# creating master dataframe 

payment.head()
print(payment.shape)
df1 = payment.merge(order_item, on = 'order_id')
print(df1.shape)
df2 = df1.merge(product, on = 'product_id')
print(df2.shape)
df3 = df2.merge(seller, on = 'seller_id')
print(df3.shape)
df4 = df3.merge(review, on = 'order_id')
print(df4.shape)
df5 = df4.merge(order, on ='order_id')
print(df5.shape)
df6 = df5.merge(translation, on = 'product_category_name')
print(df6.shape)
df = df6.merge(customer, on = 'customer_id')
print(df.shape)

(103886, 5)
(117601, 11)
(117601, 19)
(117601, 22)
(118315, 28)
(118315, 35)
(116581, 36)
(116581, 40)

# cleaning up and re-engineering some columns

df['order_purchase_year'] = df.order_purchase_timestamp.apply(lambda x: x.year)
df['order_purchase_month'] = df.order_purchase_timestamp.apply(lambda x: x.month)
df['order_purchase_dayofweek'] = df.order_purchase_timestamp.apply(lambda x: x.dayofweek)
df['order_purchase_hour'] = df.order_purchase_timestamp.apply(lambda x: x.hour)
df['order_purchase_day'] = df['order_purchase_dayofweek'].map({0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4 : 'Fri', 5: 'Sat', 6:'Sun'})
df['order_purchase_mon'] = df['order_purchase_month'].map({1 : 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                                                           7:'Jul', 8: 'Aug', 9: 'Sep',10: 'Oct', 11: 'Nov', 12: 'Dec'})
df['order_count'] = 1
df['year_month'] = df['order_purchase_timestamp'].dt.strftime('%Y-%m')

df['ship_duration'] = (df['order_delivered_customer_date'] - df['order_purchase_timestamp'])/24
df['ship_duration'] = df['ship_duration'].astype('timedelta64[h]')

df['tocarrier_duration'] = (df['order_delivered_carrier_date'] - df['order_purchase_timestamp'])/24
df['tocarrier_duration'] = df['tocarrier_duration'].astype('timedelta64[h]')

df['lastmile_duration'] = (df['order_delivered_customer_date'] - df['order_delivered_carrier_date'])/24
df['lastmile_duration'] = df['lastmile_duration'].astype('timedelta64[h]')

df['expected_vs_shipdate'] = (df['order_estimated_delivery_date'] - df['order_delivered_customer_date'])/24
df['expected_vs_shipdate'] = df['expected_vs_shipdate'].astype('timedelta64[h]')

df['expected_duration'] = (df['order_estimated_delivery_date'] - df['order_purchase_timestamp'])/24
df['expected_duration'] = df['expected_duration'].astype('timedelta64[h]')

df.head(3)

Time Series Analysis¶

Context: We observed that new user growth has gradually slowed over the last year. In the past few months, customer churn is up to 10% from last year. no obvious change in product or major bug.

Question: why are retailers churning?

Analysis Direction: We will discover the reasons for retailer churn by analyzing three dimensions:

Customer Analysis: New Users, Retention (repeated purchases) - Any trends YoY? Anything standout by Geo?
Order Analysis: Order volume (take into account seasonality) & value
Review Analysis: Any significant degrade in CSAT

# Debug code - to delete 

print(df.shape)
df_2016 = df.query('order_purchase_year == 2016')
print(df_2016.shape)

(116581, 53)
(388, 53)

Orders Volume and Sales Analysis¶

Observations: Order purchase was on a rise throughout 2017, and suddenly came to a plateau in 2018 and has been on a downward trend. Both order's volume and order's value have reduced in 2018

This reduction on order volume and sales happened very sudden at the start of 2018, across all geos

# Creating new datasets for each year 

df_2016 = df.query('order_purchase_year == 2016')
df_2017 = df.query('order_purchase_year == 2017')
df_2018 = df.query('order_purchase_year == 2018')

fig, axs = plt.subplots(1, 4, figsize = (22,10))
count_plot(feature = 'order_purchase_year', df = df, ax = axs[0], title = 'Total Order Purchase by Year')
count_plot(feature = 'order_purchase_year', df = df_2016, ax = axs[1], hue = 'order_purchase_month', title = 'Number of Orders by Month in 2016')
count_plot(feature = 'order_purchase_year', df = df_2017, ax = axs[2], hue = 'order_purchase_month', title = 'Number of Orders by Month in 2017')
count_plot(feature = 'order_purchase_year', df= df_2018, ax=axs[3], hue = 'order_purchase_month',title = 'Number of Orders by Month in 2018')

plt.suptitle('Score Counting Through the Years', y =1.1)
plt.show()

# Creating new datasets for each year 
df_2016= df.query('order_purchase_year == 2016')
df_2017 = df.query('order_purchase_year == 2017')
df_2018 = df.query('order_purchase_year == 2018')

df_ytsales = df.groupby(['order_purchase_year', 'order_purchase_month', 'year_month'], 
                        as_index = False).sum().loc[:, ['order_purchase_year', 'order_purchase_month',
                                                       'year_month', 'payment_value', 'order_count']]

df_ytsales_2016 = df_2016.groupby(['order_purchase_year', 'order_purchase_month'], 
                                  as_index = False).sum().loc[:,['order_purchase_year',
                                                                 'order_purchase_month', 'payment_value']]

df_ytsales_2017 = df_2017.groupby(['order_purchase_year', 'order_purchase_month'],
                                   as_index = False).sum().loc[:, ['order_purchase_year',
                                                                   'order_purchase_month', 'payment_value']]

df_ytsales_2018 = df_2018.groupby(['order_purchase_year', 'order_purchase_month'], 
                                  as_index = False).sum().loc[:, ['order_purchase_year', 
                                                                 'order_purchase_month', 'payment_value']]

fig, axs = plt.subplots(1, 3, figsize = (22,10))
bar_plot(x = 'order_purchase_year', y = 'payment_value', df = df_ytsales, ax = axs[0], value = True)
bar_plot(x = 'order_purchase_month', y ='payment_value', df = df_ytsales_2017, ax = axs[1], value = True)
bar_plot(x = 'order_purchase_month', y = 'payment_value', df = df_ytsales_2018, ax = axs[2], value = True)

axs[0].set_title('Yearly Sales from 2016 to 2018')
axs[1].set_title('Monthly Sales in 2017')
axs[2].set_title('Monthly Sales in 2018')

plt.suptitle('Order Payment Value Through the Years', y = 1.1)
plt.show()

fig, axs = plt.subplots(figsize = (20, 4.5))
axs = sns.lineplot(x = 'year_month', y ='payment_value', data = df_ytsales)
bar_plot(x = 'year_month', y ='payment_value', df = df_ytsales, value = True)
format_spines(axs, right_border = False)
axs.set_title('Brazillian E-commerce Monthly Sales from 2016 to 2018')
plt.show()

fig, axs = plt.subplots(figsize = (20, 4.5))
axs = sns.lineplot(x = 'year_month', y = 'order_count', data = df_ytsales)
bar_plot(x = 'year_month', y ='order_count', df = df_ytsales, value = True)
format_spines(axs, right_border = False)
axs.set_title('Brazillian E-commerce Monthly Order Volume from 2016 to 2018')
plt.show()

Analyze total customers by month-year: apart from seasonal factors, is there any problem?
Analyze churn by states (geo)

# Grouping by customer state

df_cus_count = df.groupby(['order_purchase_year', 
                           'order_purchase_month',
                           'year_month']).nunique().loc[:, ['customer_unique_id', 'seller_id']].reset_index()


df_cus_count.head(10)

fig, axs = plt.subplots(figsize = (20, 4.5))
axs = sns.lineplot(x = 'year_month', y = 'customer_unique_id', data = df_cus_count)
bar_plot(x = 'year_month', y ='customer_unique_id', df = df_cus_count, value = True)
format_spines(axs, right_border = False)
axs.set_title('Brazillian E-commerce Number of Customers from 2016 to 2018')
plt.show()

df_cus_state = df.groupby(['customer_state',
                           'order_purchase_year']).sum()['payment_value'].reset_index().sort_values(
    by = 'payment_value', ascending =False)

df_cus_state.head(10)

Based on the above table, the biggest five stages: SP, RJ, MG, RS and PR. Their sale patterns are similar, on the decline, SP dominates the overall sale figure, so let's isolate it out**

fig, axs = plt.subplots(figsize = (20, 4.5))
axs = sns.barplot(x = 'order_purchase_year', y ='payment_value', hue = 'customer_state', data = df_cus_state.head(10))
plt.show()

df_cus_state = df.groupby(['customer_state',
                           'year_month']).sum()['payment_value'].reset_index().sort_values('year_month')

df_cus_state.head(10)

top5 = ['SP', 'RJ', 'MG', 'RS', 'PR']
df_top5_state = df_cus_state.loc[df['customer_state'].isin(top5)].sort_values(by = 'year_month')
df_top5_state.head(3)

fig, axs = plt.subplots(figsize = (20, 4.5))
for state in top5: 
    axs = sns.lineplot(x = 'year_month', y = 'payment_value',
                       data = df_top5_state[df_top5_state['customer_state'] == state].sort_values(by = 'year_month'),
                       label = state)
    format_spines(axs, right_border = False)
    axs.set_title('Sales from the top 5 states from 2016 to 2018')

top4_noSP = ['RJ', 'MG', 'RS', 'PR']
df_top4_noSP = df_cus_state.loc[df_cus_state['customer_state'].isin(top4_noSP)]
df_top4_noSP.head(3)

fig, axs = plt.subplots(figsize = (20, 4.5))

for state in top4_noSP:
    axs = sns.lineplot(x = 'year_month', y ='payment_value', 
                       data = df_top4_noSP[df_top4_noSP['customer_state'] == state], label = state)
    format_spines(axs, right_border = False)
    axs.set_title('Sales from the top 4 states from 2016 to 2018, excluding SP')

All other top 4 states suffer from the same decline as SP. Since the sale decline happened 2018 across all staes, there could be only two reasons:

Competitors launch something in Brazil, attracting cusomers/sales to their sites. The top 3 states are all trending down in the past 3 to 4 months
There are some serious bugs/business strategy changes

Since the case assumed no major bugs, the first hypothesis is the most reasonable one. This hypothesis is further confirmed here "The company (Amazon) made its first big move into merchandise in October 2017, when it began offering the use of its Brazillian website to third party merchants to sell electornics

Sellers¶

The number of sellers is still increasing MoM, despite a reduced in order volume

fig, axs = plt.subplots(figsize = (20, 4.5))
axs = sns.lineplot(x='year_month', y = 'seller_id', data = df_cus_count)
bar_plot(x='year_month', y ='seller_id', df = df_cus_count, value = True)
format_spines(axs, right_border = False)
axs.set_title('Brazillian E-commerce Number of Sellers from 2016 to df_2018')
plt.show()

Churn Analysis¶

#Create a dataframe to count how many times a customer shop

df_order = df.groupby(['order_id', 'year_month', 
                       'order_purchase_year', 'customer_unique_id'], 
                      as_index = False).sum().loc[:, ['order_id', 'customer_unique_id', 
                                                      'year_month', 'order_purchase_year', 
                                                      'payment_value']].sort_values(by='year_month', ascending = True)
df_order['time_to_shop'] = 1
df_order['time_to_shop'] = df_order.groupby(['customer_unique_id']).cumcount()+1 
df_order.head(10)

df_order_2016 = df_order[df_order['order_purchase_year'] == 2016]
df_order_2017 = df_order[df_order['order_purchase_year'] == 2017]
df_order_2018 = df_order[df_order['order_purchase_year'] == 2018]

df_count_cust = df_order.groupby(['customer_unique_id']).count().reset_index()
df_count_cust['order_count'] = df_count_cust['order_id']
df_count_cust = df_count_cust.drop(['order_id', 'year_month', 
                                    'payment_value', 'time_to_shop', 'order_purchase_year'], axis = 1)
df_count_cust = df_count_cust.groupby(['order_count']).count().reset_index().rename(columns = 
                                                                                    {'customer_unique_id' : 'num_customer'})
df_count_cust['percentage_customer'] = 100*df_count_cust['num_customer']/df_count_cust['num_customer'].sum()
df_count_cust

96% of customers only buy with Olist once, which is a big problem

df_count_cust = df_order_2016.groupby(['customer_unique_id']).count().reset_index()
df_count_cust['order_count'] = df_count_cust['order_id']
df_count_cust = df_count_cust.drop(['order_id', 'year_month', 'payment_value', 'time_to_shop'], axis = 1)
df_count_cust = df_count_cust.groupby(['order_count']).count().reset_index().rename(columns = {
    'customer_unique_id':'num_customer'})

df_count_cust["percentage_customer"] = 100.0 * df_count_cust["num_customer"] / df_count_cust["num_customer"].sum()
df_count_cust

df_count_cust = df_order_2017.groupby(['customer_unique_id']).count().reset_index()
df_count_cust['order_count'] = df_count_cust['order_id']
df_count_cust = df_count_cust.drop(['order_id', 'year_month', 'payment_value', 'time_to_shop'], axis =1)
df_count_cust = df_count_cust.groupby(['order_count']).count().reset_index().rename(columns = {"customer_unique_id" : 'num_customer'})
df_count_cust['percentage_customer'] = 100*df_count_cust['num_customer']/df_count_cust['num_customer'].sum()
df_count_cust

df_count_cust = df_order_2018.groupby(['customer_unique_id']).count().reset_index()
df_count_cust['order_count'] = df_count_cust['order_id']
df_count_cust = df_count_cust.drop(['order_id', 'year_month', 'payment_value', 'time_to_shop'], axis =1)
df_count_cust = df_count_cust.groupby(['order_count']).count().reset_index().rename(columns = {"customer_unique_id" : 'num_customer'})
df_count_cust['percentage_customer'] = 100*df_count_cust['num_customer']/df_count_cust['num_customer'].sum()
df_count_cust

Rootcause Analysis¶

Why 96% of customers shop with us only once? Key factors include

Price
Customer Experience - proxy by Review
Delivery Duration

Since we have review scores and order delivery time, we will focus here first

df_quality = df.groupby(['order_purchase_year', 'year_month'], as_index = False).mean().loc[:, ['order_purchase_year', 'year_month', 'expected_duration', 'ship_duration', 'tocarrier_duration', 'lastmile_duration', 'expected_vs_shipdate', 'review_score']]
df_quality.head(10)

df_quality = df.groupby(['order_purchase_year'], as_index = False).mean().loc[:, ['order_purchase_year', 'expected_duration', 'ship_duration', 'tocarrier_duration', 'lastmile_duration', 'expected_vs_shipdate', 'review_score']]
df_quality.head(10)

Although our review score í not too bad, we have a very long end-to-end ship duration. the review score is high becuase our expected delivery date is almost a month, so we beat it everytime!

fig, axs = plt.subplots(nrows = 2, ncols = 2, figsize = (20,10), dpi =120)
df['expected_duration'].plot.hist(bins = 30, alpha = 1, ax = axs[0, 0])
axs[0, 0].set_title('Expected Ship Duration (days)')

df['ship_duration'].plot.hist(bins = 30, alpha = 1, ax = axs[1, 0])
axs[1, 0].set_title('End-to-End Ship Duration (days)')

df['tocarrier_duration'].plot.hist(bins = 30, alpha = 1, ax = axs[0, 1])
axs[0, 1].set_title('Middle mile lead-time: from retailers to carriers(days)')

df['lastmile_duration'].plot.hist(bins = 30, alpha = 1, ax  = axs[1,1])
axs[1, 1].set_title('Last mile lead-time: from carriers to customers (days)')

Text(0.5, 1.0, 'Last mile lead-time: from carriers to customers (days)')

#drop outliers to make the histograms clearer 
df_quality_chart_1 = df[df.expected_duration < 50] #drop expected duration more than 60 days from purchase date
df_quality_chart_2 = df[df.ship_duration < 50] # drop any end-to-end shiduration more than 60 days from purchase date 
df_quality_chart_3 = df[(df['tocarrier_duration'] < 30) & (df['tocarrier_duration']>0)] # drop any end-to-end ship duration more than 10 days from purchase date
df_quality_chart_4 = df[(df['lastmile_duration'] < 30) & (df['lastmile_duration'] > 0)] # drop any end-to-end ship duration more than 60 days from purchase date
df_quality_chart_5 = df[(df['expected_vs_shipdate'] < 30) & (df['expected_vs_shipdate'] > -30)] #drop any difference beyond 50 dats btw expected ship date and actual ship date

fig, axes = plt.subplots(nrows = 2, ncols =2, figsize = (20, 10), dpi = 120)


df_quality_chart_1['expected_duration'].plot.hist(bins=30, alpha = 1,ax = axes[0,0])
axes[0,0].set_title('Expected Ship Duration 2016-2018 (days)')

df_quality_chart_2['ship_duration'].plot.hist(bins=30, alpha = 1,ax = axes[1,0])
axes[1,0].set_title('End-to-End Ship Duration 2016-2018 (days)')

df_quality_chart_3['tocarrier_duration'].plot.hist(bins=20, alpha = 0.5,ax = axes[0,1])
axes[0,1].set_title('Middle mile lead-time: from retailers to carriers 2016-2018 (days)')

df_quality_chart_4['lastmile_duration'].plot.hist(bins=20, alpha = 0.5,ax = axes[1,1])
axes[1,1].set_title('Last mile lead-time: from carriers to customers 2016-2018 (days)')

#df_quality_chart['lastmile_duration'].plot.hist(bins=30, alpha = 1)

Text(0.5, 1.0, 'Last mile lead-time: from carriers to customers 2016-2018 (days)')

df_quality_chart_5['expected_vs_shipdate'].plot.hist(bins=15, alpha = 1)
plt.title("Difference between expected ship date and delivered date")

Text(0.5, 1.0, 'Difference between expected ship date and delivered date')

This delivery performance won't keep any customers. Average duration to ship is between 15-30 days!

df['review_score'].plot.hist(bins = 10, alpha = 1)
plt.title('Review Score 2016-2018')

Text(0.5, 1.0, 'Review Score 2016-2018')

	order_id	payment_sequential	payment_type	payment_installments	payment_value	order_item_id	product_id	seller_id	shipping_limit_date	price	freight_value	product_category_name	product_name_lenght	product_description_lenght	product_photos_qty	product_weight_g	product_length_cm	product_height_cm	product_width_cm	seller_zip_code_prefix	seller_city	seller_state	review_id	review_score	review_comment_title	review_comment_message	review_creation_date	review_answer_timestamp	customer_id	order_status	order_purchase_timestamp	order_approved_at	order_delivered_carrier_date	order_delivered_customer_date	order_estimated_delivery_date	product_category_name_english	customer_unique_id	customer_zip_code_prefix	customer_city	customer_state	order_purchase_year	order_purchase_month	order_purchase_dayofweek	order_purchase_hour	order_purchase_day	order_purchase_mon	order_count	year_month	ship_duration	tocarrier_duration	lastmile_duration	expected_vs_shipdate	expected_duration
0	b81ef226f3fe1789b1e8b2acac839d17	1	credit_card	8	99.33	1	af74cc53dcffc8384b29e7abfa41902b	213b25e6f54661939f11710a6fddb871	2018-05-02 22:15:09	79.80	19.53	casa_construcao	57.0	921.0	8.0	800.0	17.0	27.0	17.0	13321	salto	SP	4ce650e117a5b292bf5ea980a61eb61e	1	Não recomendo	Produto com qualidade ruim, amassado e descasc...	2018-05-10 00:00:00	2018-05-15 23:38:00	0a8556ac6be836b46b3e89920d59291c	delivered	2018-04-25 22:01:49	2018-04-25 22:15:09	2018-05-02 15:20:00	2018-05-09 17:36:51	2018-05-22	home_construction	708ab75d2a007f0564aedd11139c7708	39801	teofilo otoni	MG	2018	4	2	22	Wed	Apr	1	2018-04	13.0	6.0	7.0	12.0	26.0
1	d86305c284cecff56c7ee9d3fbd4f8d6	1	credit_card	4	93.72	1	af74cc53dcffc8384b29e7abfa41902b	213b25e6f54661939f11710a6fddb871	2018-04-19 23:10:02	79.80	13.92	casa_construcao	57.0	921.0	8.0	800.0	17.0	27.0	17.0	13321	salto	SP	ec9d84039245b981ebdbf7d16c723b6e	4	NaN	NaN	2018-04-21 00:00:00	2018-04-21 10:31:10	0e37385834260d6111727103a93e790a	delivered	2018-04-15 22:53:59	2018-04-15 23:10:32	2018-04-16 23:54:55	2018-04-20 00:07:39	2018-05-09	home_construction	550a1a0e8ad53e4147e6c761b5699c2b	14781	barretos	SP	2018	4	6	22	Sun	Apr	1	2018-04	4.0	1.0	3.0	18.0	23.0
2	5308fdab519f7478ef64894128fbbee4	1	credit_card	1	36.07	1	1b93af229a5768eaaed2064722b1c1dc	213b25e6f54661939f11710a6fddb871	2018-06-11 12:11:27	129.95	18.79	casa_construcao	57.0	915.0	5.0	9950.0	52.0	22.0	52.0	13321	salto	SP	8c4799d2bd4f23b35cf959e0cd301b4d	5	Produto muito bom.	NaN	2018-06-21 00:00:00	2018-06-24 20:43:00	3aec6e3846fd122ac430ac1a816107a8	delivered	2018-06-02 11:51:00	2018-06-02 12:11:27	2018-06-05 15:02:00	2018-06-20 23:09:42	2018-07-13	home_construction	74fe4e106702770ef728399ce5da503a	74823	goiania	GO	2018	6	5	11	Sat	Jun	1	2018-06	18.0	3.0	15.0	22.0	40.0

	order_purchase_year	order_purchase_month	year_month	customer_unique_id	seller_id
0	2016	9	2016-09	2	2
1	2016	10	2016-10	303	142
2	2016	12	2016-12	1	1
3	2017	1	2017-01	745	223
4	2017	2	2017-02	1659	411
5	2017	3	2017-03	2552	490
6	2017	4	2017-04	2296	495
7	2017	5	2017-05	3506	571
8	2017	6	2017-06	3099	524
9	2017	7	2017-07	3846	591

	customer_state	year_month	payment_value
434	RR	2016-09	272.46
452	RS	2016-09	75.06
495	SE	2016-10	345.92
99	CE	2016-10	1949.78
474	SC	2016-10	4834.53
120	DF	2016-10	1498.71
141	ES	2016-10	1067.14
453	RS	2016-10	5498.35
435	RR	2016-10	69.02
78	BA	2016-10	995.34

	customer_state	year_month	payment_value
434	RR	2016-09	272.46
452	RS	2016-09	75.06
20	AL	2016-10	129.90

	order_count	num_customer	percentage_customer
0	1	91238	96.971951
1	2	2617	2.781468
2	3	185	0.196627
3	4	30	0.031885
4	5	9	0.009566
5	6	3	0.003189
6	7	3	0.003189
7	9	1	0.001063
8	15	1	0.001063

Untitled

Nguyen_LSCM

12/4/2020

Time Series Analysis¶

Orders Volume and Sales Analysis¶

Sellers¶

Churn Analysis¶

Rootcause Analysis¶

	customer_state	order_purchase_year	payment_value
72	SP	2018	4363136.98
71	SP	2017	3170002.61
51	RJ	2017	1367630.83
52	RJ	2018	1366640.42
29	MG	2018	1231548.90
28	MG	2017	1071657.48
63	RS	2018	592221.03
49	PR	2018	589263.57
62	RS	2017	537690.87
48	PR	2017	463667.69

	order_count	num_customer	order_purchase_year	percentage_customer
0	1	41277	41277	97.186382
1	2	1102	1102	2.594651
2	3	75	75	0.176587
3	4	12	12	0.028254
4	5	4	4	0.009418
5	6	1	1	0.002354
6	8	1	1	0.002354

	order_count	num_customer	order_purchase_year	percentage_customer
0	1	50864	50864	97.873732
1	2	1038	1038	1.997345
2	3	56	56	0.107757
3	4	8	8	0.015394
4	5	1	1	0.001924
5	6	1	1	0.001924
6	7	1	1	0.001924

	order_purchase_year	year_month	expected_duration	ship_duration	tocarrier_duration	lastmile_duration	expected_vs_shipdate	review_score
0	2016	2016-09	47.333333	NaN	43.000000	NaN	NaN	1.000000
1	2016	2016-10	55.492188	19.702065	14.008621	5.498525	35.097345	3.578125
2	2016	2016-12	26.000000	4.000000	3.000000	1.000000	21.000000	5.000000
3	2017	2017-01	39.846535	12.145511	2.920892	8.841073	26.937049	4.094059
4	2017	2017-02	31.689176	12.655010	3.113344	9.099165	18.157620	4.046177
5	2017	2017-03	24.682514	12.450890	2.715769	9.319051	11.408372	4.052329
6	2017	2017-04	27.062522	14.160470	3.110499	10.564871	12.206753	3.955803
7	2017	2017-05	23.923520	10.863831	2.460258	7.913736	12.349810	4.117715
8	2017	2017-06	23.744143	11.731860	2.720936	8.593290	11.464812	4.124867
9	2017	2017-07	23.031802	11.016386	2.747952	7.789317	11.637795	4.110580

	order_purchase_year	expected_duration	ship_duration	tocarrier_duration	lastmile_duration	expected_vs_shipdate	review_score
0	2016	55.353093	19.655882	14.142450	5.485294	35.055882	3.561856
1	2017	24.388270	12.458088	3.001244	8.973969	11.449063	4.033041
2	2018	22.579155	11.613360	2.559633	8.582858	10.600364	4.007279