setwd('C:/Users/DellPC/Desktop/Corner/Py_source_code/Project/Olist')

htmltools::includeHTML('E-commerce Churn Analysis.html')
E-commerce Churn Analysis
In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns 
sns.set(style = 'ticks')
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker

import os 
for dirname, _, filenames in os.walk(''):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
import sklearn 
import scipy 
import warnings 
warnings.filterwarnings("ignore", category = DeprecationWarning)
pd.set_option('display.max_columns', 100)
%matplotlib inline
In [2]:
def format_spines(ax, right_border=True):
    
    ax.spines['bottom'].set_color('#666666')
    ax.spines['left'].set_color('#666666')
    ax.spines['top'].set_visible(False)
    if right_border:
        ax.spines['right'].set_color('#FFFFFF')
    else:
        ax.spines['right'].set_color('#FFFFFF')
    ax.patch.set_facecolor('#FFFFFF')

def count_plot(feature, df, colors='Greens_d', hue=False, ax=None, title=''):
    
    # Preparing variables
    ncount = len(df)
    if hue != False:
        ax = sns.countplot(x=feature, data=df, palette=colors, hue=hue, ax=ax)
    else:
        ax = sns.countplot(x=feature, data=df, palette=colors, ax=ax)
        
    format_spines(ax)
    
    # Setting percentage
    for p in ax.patches:
        x=p.get_bbox().get_points()[:,0]
        y=p.get_bbox().get_points()[1,1]
#        ax.annotate('{:.1f}%'.format(100.*y/ncount), (x.mean(), y), 
#                ha='center', va='bottom') # set the alignment of the text
        ax.annotate(y, (x.mean(), y), 
                ha='center', va='bottom') # set the alignment of the text
    
    # Final configuration
    if not hue:
        ax.set_title(df[feature].describe().name + ' Analysis', size=13, pad=15)
    else:
        ax.set_title(df[feature].describe().name + ' Analysis by ' + hue, size=13, pad=15)  
    if title != '':
        ax.set_title(title)       
    plt.tight_layout()
    
    
def bar_plot(x, y, df, colors='Blues_d', hue=False, ax=None, value=False, title=''):
    
    # Preparing variables
    try:
        ncount = sum(df[y])
    except:
        ncount = sum(df[x])
    #fig, ax = plt.subplots()
    if hue != False:
        ax = sns.barplot(x=x, y=y, data=df, palette=colors, hue=hue, ax=ax, ci=None)
    else:
        ax = sns.barplot(x=x, y=y, data=df, palette=colors, ax=ax, ci=None)

    # Setting borders
    format_spines(ax)

    # Setting percentage
    for p in ax.patches:
        xp=p.get_bbox().get_points()[:,0]
        yp=p.get_bbox().get_points()[1,1]
        if value:
            ax.annotate('{:.2f}k'.format(yp/1000), (xp.mean(), yp), 
                    ha='center', va='bottom') # set the alignment of the text
        else:
            ax.annotate('{:.1f}%'.format(100.*yp/ncount), (xp.mean(), yp), 
                    ha='center', va='bottom') # set the alignment of the text
    if not hue:
        ax.set_title(df[x].describe().name + ' Analysis', size=12, pad=15)
    else:
        ax.set_title(df[x].describe().name + ' Analysis by ' + hue, size=12, pad=15)
    if title != '':
        ax.set_title(title)  
    plt.tight_layout()
    
    
def categorical_plot(cols_cat, axs, df):
    
    idx_row = 0
    for col in cols_cat:
        # Returning column index
        idx_col = cols_cat.index(col)

        # Verifying brake line in figure (second row)
        if idx_col >= 3:
            idx_col -= 3
            idx_row = 1

        # Plot params
        names = df[col].value_counts().index
        heights = df[col].value_counts().values

        # Bar chart
        axs[idx_row, idx_col].bar(names, heights, color='navy')
        if (idx_row, idx_col) == (0, 2):
            y_pos = np.arange(len(names))
            axs[idx_row, idx_col].tick_params(axis='x', labelrotation=30)
        if (idx_row, idx_col) == (1, 1):
            y_pos = np.arange(len(names))
            axs[idx_row, idx_col].tick_params(axis='x', labelrotation=90)

        total = df[col].value_counts().sum()
        axs[idx_row, idx_col].patch.set_facecolor('#FFFFFF')
        format_spines(axs[idx_row, idx_col], right_border=False)
        for p in axs[idx_row, idx_col].patches:
            w, h = p.get_width(), p.get_height()
            x, y = p.get_xy()
            axs[idx_row, idx_col].annotate('{:.1%}'.format(h/1000), (p.get_x()+.29*w,
                                            p.get_y()+h+20), color='k')

        # Plot configuration
        axs[idx_row, idx_col].set_title(col, size=12)
        axs[idx_row, idx_col].set_ylim(0, heights.max()+120)
In [3]:
#Load data

order = pd.read_csv("olist_orders_dataset.csv")
customer = pd.read_csv("olist_order_customer_dataset.csv")
review = pd.read_csv("olist_order_reviews_dataset.csv")
payment = pd.read_csv("olist_order_payments_dataset.csv")
order_item = pd.read_csv("olist_order_items_dataset.csv")
product = pd.read_csv("olist_products_dataset.csv")
seller = pd.read_csv('olist_sellers_dataset.csv')
geo = pd.read_csv('olist_geolocation_dataset.csv')
translation = pd.read_csv('product_category_name_translation.csv')
In [4]:
# Data Pre-processing 
# Translate Product Category

#Convert Timestamp Data

order_date = ['order_purchase_timestamp', u'order_approved_at', 
              u'order_delivered_carrier_date', u'order_delivered_customer_date', 
              u'order_delivered_customer_date', u'order_estimated_delivery_date']


for items in order_date:
    order[items] = pd.to_datetime(order[items], format = '%Y-%m-%d %H:%M:%S')
In [5]:
# creating master dataframe 

payment.head()
print(payment.shape)
df1 = payment.merge(order_item, on = 'order_id')
print(df1.shape)
df2 = df1.merge(product, on = 'product_id')
print(df2.shape)
df3 = df2.merge(seller, on = 'seller_id')
print(df3.shape)
df4 = df3.merge(review, on = 'order_id')
print(df4.shape)
df5 = df4.merge(order, on ='order_id')
print(df5.shape)
df6 = df5.merge(translation, on = 'product_category_name')
print(df6.shape)
df = df6.merge(customer, on = 'customer_id')
print(df.shape)
(103886, 5)
(117601, 11)
(117601, 19)
(117601, 22)
(118315, 28)
(118315, 35)
(116581, 36)
(116581, 40)
In [6]:
# cleaning up and re-engineering some columns

df['order_purchase_year'] = df.order_purchase_timestamp.apply(lambda x: x.year)
df['order_purchase_month'] = df.order_purchase_timestamp.apply(lambda x: x.month)
df['order_purchase_dayofweek'] = df.order_purchase_timestamp.apply(lambda x: x.dayofweek)
df['order_purchase_hour'] = df.order_purchase_timestamp.apply(lambda x: x.hour)
df['order_purchase_day'] = df['order_purchase_dayofweek'].map({0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4 : 'Fri', 5: 'Sat', 6:'Sun'})
df['order_purchase_mon'] = df['order_purchase_month'].map({1 : 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                                                           7:'Jul', 8: 'Aug', 9: 'Sep',10: 'Oct', 11: 'Nov', 12: 'Dec'})
df['order_count'] = 1
df['year_month'] = df['order_purchase_timestamp'].dt.strftime('%Y-%m')

df['ship_duration'] = (df['order_delivered_customer_date'] - df['order_purchase_timestamp'])/24
df['ship_duration'] = df['ship_duration'].astype('timedelta64[h]')

df['tocarrier_duration'] = (df['order_delivered_carrier_date'] - df['order_purchase_timestamp'])/24
df['tocarrier_duration'] = df['tocarrier_duration'].astype('timedelta64[h]')

df['lastmile_duration'] = (df['order_delivered_customer_date'] - df['order_delivered_carrier_date'])/24
df['lastmile_duration'] = df['lastmile_duration'].astype('timedelta64[h]')

df['expected_vs_shipdate'] = (df['order_estimated_delivery_date'] - df['order_delivered_customer_date'])/24
df['expected_vs_shipdate'] = df['expected_vs_shipdate'].astype('timedelta64[h]')

df['expected_duration'] = (df['order_estimated_delivery_date'] - df['order_purchase_timestamp'])/24
df['expected_duration'] = df['expected_duration'].astype('timedelta64[h]')
In [7]:
df.head(3)
Out[7]:
order_id payment_sequential payment_type payment_installments payment_value order_item_id product_id seller_id shipping_limit_date price freight_value product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm seller_zip_code_prefix seller_city seller_state review_id review_score review_comment_title review_comment_message review_creation_date review_answer_timestamp customer_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date product_category_name_english customer_unique_id customer_zip_code_prefix customer_city customer_state order_purchase_year order_purchase_month order_purchase_dayofweek order_purchase_hour order_purchase_day order_purchase_mon order_count year_month ship_duration tocarrier_duration lastmile_duration expected_vs_shipdate expected_duration
0 b81ef226f3fe1789b1e8b2acac839d17 1 credit_card 8 99.33 1 af74cc53dcffc8384b29e7abfa41902b 213b25e6f54661939f11710a6fddb871 2018-05-02 22:15:09 79.80 19.53 casa_construcao 57.0 921.0 8.0 800.0 17.0 27.0 17.0 13321 salto SP 4ce650e117a5b292bf5ea980a61eb61e 1 Não recomendo Produto com qualidade ruim, amassado e descasc... 2018-05-10 00:00:00 2018-05-15 23:38:00 0a8556ac6be836b46b3e89920d59291c delivered 2018-04-25 22:01:49 2018-04-25 22:15:09 2018-05-02 15:20:00 2018-05-09 17:36:51 2018-05-22 home_construction 708ab75d2a007f0564aedd11139c7708 39801 teofilo otoni MG 2018 4 2 22 Wed Apr 1 2018-04 13.0 6.0 7.0 12.0 26.0
1 d86305c284cecff56c7ee9d3fbd4f8d6 1 credit_card 4 93.72 1 af74cc53dcffc8384b29e7abfa41902b 213b25e6f54661939f11710a6fddb871 2018-04-19 23:10:02 79.80 13.92 casa_construcao 57.0 921.0 8.0 800.0 17.0 27.0 17.0 13321 salto SP ec9d84039245b981ebdbf7d16c723b6e 4 NaN NaN 2018-04-21 00:00:00 2018-04-21 10:31:10 0e37385834260d6111727103a93e790a delivered 2018-04-15 22:53:59 2018-04-15 23:10:32 2018-04-16 23:54:55 2018-04-20 00:07:39 2018-05-09 home_construction 550a1a0e8ad53e4147e6c761b5699c2b 14781 barretos SP 2018 4 6 22 Sun Apr 1 2018-04 4.0 1.0 3.0 18.0 23.0
2 5308fdab519f7478ef64894128fbbee4 1 credit_card 1 36.07 1 1b93af229a5768eaaed2064722b1c1dc 213b25e6f54661939f11710a6fddb871 2018-06-11 12:11:27 129.95 18.79 casa_construcao 57.0 915.0 5.0 9950.0 52.0 22.0 52.0 13321 salto SP 8c4799d2bd4f23b35cf959e0cd301b4d 5 Produto muito bom. NaN 2018-06-21 00:00:00 2018-06-24 20:43:00 3aec6e3846fd122ac430ac1a816107a8 delivered 2018-06-02 11:51:00 2018-06-02 12:11:27 2018-06-05 15:02:00 2018-06-20 23:09:42 2018-07-13 home_construction 74fe4e106702770ef728399ce5da503a 74823 goiania GO 2018 6 5 11 Sat Jun 1 2018-06 18.0 3.0 15.0 22.0 40.0

Time Series Analysis

Context: We observed that new user growth has gradually slowed over the last year. In the past few months, customer churn is up to 10% from last year. no obvious change in product or major bug.

Question: why are retailers churning?

Analysis Direction: We will discover the reasons for retailer churn by analyzing three dimensions:

  1. Customer Analysis: New Users, Retention (repeated purchases) - Any trends YoY? Anything standout by Geo?
  2. Order Analysis: Order volume (take into account seasonality) & value
  3. Review Analysis: Any significant degrade in CSAT
In [8]:
# Debug code - to delete 

print(df.shape)
df_2016 = df.query('order_purchase_year == 2016')
print(df_2016.shape)
(116581, 53)
(388, 53)

Orders Volume and Sales Analysis

Observations: Order purchase was on a rise throughout 2017, and suddenly came to a plateau in 2018 and has been on a downward trend. Both order's volume and order's value have reduced in 2018

This reduction on order volume and sales happened very sudden at the start of 2018, across all geos

In [9]:
# Creating new datasets for each year 

df_2016 = df.query('order_purchase_year == 2016')
df_2017 = df.query('order_purchase_year == 2017')
df_2018 = df.query('order_purchase_year == 2018')

fig, axs = plt.subplots(1, 4, figsize = (22,10))
count_plot(feature = 'order_purchase_year', df = df, ax = axs[0], title = 'Total Order Purchase by Year')
count_plot(feature = 'order_purchase_year', df = df_2016, ax = axs[1], hue = 'order_purchase_month', title = 'Number of Orders by Month in 2016')
count_plot(feature = 'order_purchase_year', df = df_2017, ax = axs[2], hue = 'order_purchase_month', title = 'Number of Orders by Month in 2017')
count_plot(feature = 'order_purchase_year', df= df_2018, ax=axs[3], hue = 'order_purchase_month',title = 'Number of Orders by Month in 2018')

plt.suptitle('Score Counting Through the Years', y =1.1)
plt.show()
In [10]:
# Creating new datasets for each year 
df_2016= df.query('order_purchase_year == 2016')
df_2017 = df.query('order_purchase_year == 2017')
df_2018 = df.query('order_purchase_year == 2018')

df_ytsales = df.groupby(['order_purchase_year', 'order_purchase_month', 'year_month'], 
                        as_index = False).sum().loc[:, ['order_purchase_year', 'order_purchase_month',
                                                       'year_month', 'payment_value', 'order_count']]

df_ytsales_2016 = df_2016.groupby(['order_purchase_year', 'order_purchase_month'], 
                                  as_index = False).sum().loc[:,['order_purchase_year',
                                                                 'order_purchase_month', 'payment_value']]

df_ytsales_2017 = df_2017.groupby(['order_purchase_year', 'order_purchase_month'],
                                   as_index = False).sum().loc[:, ['order_purchase_year',
                                                                   'order_purchase_month', 'payment_value']]

df_ytsales_2018 = df_2018.groupby(['order_purchase_year', 'order_purchase_month'], 
                                  as_index = False).sum().loc[:, ['order_purchase_year', 
                                                                 'order_purchase_month', 'payment_value']]

fig, axs = plt.subplots(1, 3, figsize = (22,10))
bar_plot(x = 'order_purchase_year', y = 'payment_value', df = df_ytsales, ax = axs[0], value = True)
bar_plot(x = 'order_purchase_month', y ='payment_value', df = df_ytsales_2017, ax = axs[1], value = True)
bar_plot(x = 'order_purchase_month', y = 'payment_value', df = df_ytsales_2018, ax = axs[2], value = True)

axs[0].set_title('Yearly Sales from 2016 to 2018')
axs[1].set_title('Monthly Sales in 2017')
axs[2].set_title('Monthly Sales in 2018')

plt.suptitle('Order Payment Value Through the Years', y = 1.1)
plt.show()
In [11]:
fig, axs = plt.subplots(figsize = (20, 4.5))
axs = sns.lineplot(x = 'year_month', y ='payment_value', data = df_ytsales)
bar_plot(x = 'year_month', y ='payment_value', df = df_ytsales, value = True)
format_spines(axs, right_border = False)
axs.set_title('Brazillian E-commerce Monthly Sales from 2016 to 2018')
plt.show()
In [12]:
fig, axs = plt.subplots(figsize = (20, 4.5))
axs = sns.lineplot(x = 'year_month', y = 'order_count', data = df_ytsales)
bar_plot(x = 'year_month', y ='order_count', df = df_ytsales, value = True)
format_spines(axs, right_border = False)
axs.set_title('Brazillian E-commerce Monthly Order Volume from 2016 to 2018')
plt.show()
  1. Analyze total customers by month-year: apart from seasonal factors, is there any problem?
  2. Analyze churn by states (geo)
In [13]:
# Grouping by customer state

df_cus_count = df.groupby(['order_purchase_year', 
                           'order_purchase_month',
                           'year_month']).nunique().loc[:, ['customer_unique_id', 'seller_id']].reset_index()


df_cus_count.head(10)
Out[13]:
order_purchase_year order_purchase_month year_month customer_unique_id seller_id
0 2016 9 2016-09 2 2
1 2016 10 2016-10 303 142
2 2016 12 2016-12 1 1
3 2017 1 2017-01 745 223
4 2017 2 2017-02 1659 411
5 2017 3 2017-03 2552 490
6 2017 4 2017-04 2296 495
7 2017 5 2017-05 3506 571
8 2017 6 2017-06 3099 524
9 2017 7 2017-07 3846 591
In [14]:
fig, axs = plt.subplots(figsize = (20, 4.5))
axs = sns.lineplot(x = 'year_month', y = 'customer_unique_id', data = df_cus_count)
bar_plot(x = 'year_month', y ='customer_unique_id', df = df_cus_count, value = True)
format_spines(axs, right_border = False)
axs.set_title('Brazillian E-commerce Number of Customers from 2016 to 2018')
plt.show()
In [15]:
df_cus_state = df.groupby(['customer_state',
                           'order_purchase_year']).sum()['payment_value'].reset_index().sort_values(
    by = 'payment_value', ascending =False)

df_cus_state.head(10)
Out[15]:
customer_state order_purchase_year payment_value
72 SP 2018 4363136.98
71 SP 2017 3170002.61
51 RJ 2017 1367630.83
52 RJ 2018 1366640.42
29 MG 2018 1231548.90
28 MG 2017 1071657.48
63 RS 2018 592221.03
49 PR 2018 589263.57
62 RS 2017 537690.87
48 PR 2017 463667.69

Based on the above table, the biggest five stages: SP, RJ, MG, RS and PR. Their sale patterns are similar, on the decline, SP dominates the overall sale figure, so let's isolate it out**

In [16]:
fig, axs = plt.subplots(figsize = (20, 4.5))
axs = sns.barplot(x = 'order_purchase_year', y ='payment_value', hue = 'customer_state', data = df_cus_state.head(10))
plt.show()
In [17]:
df_cus_state = df.groupby(['customer_state',
                           'year_month']).sum()['payment_value'].reset_index().sort_values('year_month')

df_cus_state.head(10)
Out[17]:
customer_state year_month payment_value
434 RR 2016-09 272.46
452 RS 2016-09 75.06
495 SE 2016-10 345.92
99 CE 2016-10 1949.78
474 SC 2016-10 4834.53
120 DF 2016-10 1498.71
141 ES 2016-10 1067.14
453 RS 2016-10 5498.35
435 RR 2016-10 69.02
78 BA 2016-10 995.34
In [18]:
top5 = ['SP', 'RJ', 'MG', 'RS', 'PR']
df_top5_state = df_cus_state.loc[df['customer_state'].isin(top5)].sort_values(by = 'year_month')
df_top5_state.head(3)
Out[18]:
customer_state year_month payment_value
434 RR 2016-09 272.46
452 RS 2016-09 75.06
20 AL 2016-10 129.90
In [19]:
fig, axs = plt.subplots(figsize = (20, 4.5))
for state in top5: 
    axs = sns.lineplot(x = 'year_month', y = 'payment_value',
                       data = df_top5_state[df_top5_state['customer_state'] == state].sort_values(by = 'year_month'),
                       label = state)
    format_spines(axs, right_border = False)
    axs.set_title('Sales from the top 5 states from 2016 to 2018')
In [20]:
top4_noSP = ['RJ', 'MG', 'RS', 'PR']
df_top4_noSP = df_cus_state.loc[df_cus_state['customer_state'].isin(top4_noSP)]
df_top4_noSP.head(3)

fig, axs = plt.subplots(figsize = (20, 4.5))

for state in top4_noSP:
    axs = sns.lineplot(x = 'year_month', y ='payment_value', 
                       data = df_top4_noSP[df_top4_noSP['customer_state'] == state], label = state)
    format_spines(axs, right_border = False)
    axs.set_title('Sales from the top 4 states from 2016 to 2018, excluding SP')

All other top 4 states suffer from the same decline as SP. Since the sale decline happened 2018 across all staes, there could be only two reasons:

  • Competitors launch something in Brazil, attracting cusomers/sales to their sites. The top 3 states are all trending down in the past 3 to 4 months
  • There are some serious bugs/business strategy changes

Since the case assumed no major bugs, the first hypothesis is the most reasonable one. This hypothesis is further confirmed here "The company (Amazon) made its first big move into merchandise in October 2017, when it began offering the use of its Brazillian website to third party merchants to sell electornics

Sellers

The number of sellers is still increasing MoM, despite a reduced in order volume

In [21]:
fig, axs = plt.subplots(figsize = (20, 4.5))
axs = sns.lineplot(x='year_month', y = 'seller_id', data = df_cus_count)
bar_plot(x='year_month', y ='seller_id', df = df_cus_count, value = True)
format_spines(axs, right_border = False)
axs.set_title('Brazillian E-commerce Number of Sellers from 2016 to df_2018')
plt.show()

Churn Analysis

In [22]:
#Create a dataframe to count how many times a customer shop

df_order = df.groupby(['order_id', 'year_month', 
                       'order_purchase_year', 'customer_unique_id'], 
                      as_index = False).sum().loc[:, ['order_id', 'customer_unique_id', 
                                                      'year_month', 'order_purchase_year', 
                                                      'payment_value']].sort_values(by='year_month', ascending = True)
df_order['time_to_shop'] = 1
df_order['time_to_shop'] = df_order.groupby(['customer_unique_id']).cumcount()+1 
df_order.head(10)

df_order_2016 = df_order[df_order['order_purchase_year'] == 2016]
df_order_2017 = df_order[df_order['order_purchase_year'] == 2017]
df_order_2018 = df_order[df_order['order_purchase_year'] == 2018]
In [23]:
df_count_cust = df_order.groupby(['customer_unique_id']).count().reset_index()
df_count_cust['order_count'] = df_count_cust['order_id']
df_count_cust = df_count_cust.drop(['order_id', 'year_month', 
                                    'payment_value', 'time_to_shop', 'order_purchase_year'], axis = 1)
df_count_cust = df_count_cust.groupby(['order_count']).count().reset_index().rename(columns = 
                                                                                    {'customer_unique_id' : 'num_customer'})
df_count_cust['percentage_customer'] = 100*df_count_cust['num_customer']/df_count_cust['num_customer'].sum()
df_count_cust
Out[23]:
order_count num_customer percentage_customer
0 1 91238 96.971951
1 2 2617 2.781468
2 3 185 0.196627
3 4 30 0.031885
4 5 9 0.009566
5 6 3 0.003189
6 7 3 0.003189
7 9 1 0.001063
8 15 1 0.001063

96% of customers only buy with Olist once, which is a big problem

In [24]:
df_count_cust = df_order_2016.groupby(['customer_unique_id']).count().reset_index()
df_count_cust['order_count'] = df_count_cust['order_id']
df_count_cust = df_count_cust.drop(['order_id', 'year_month', 'payment_value', 'time_to_shop'], axis = 1)
df_count_cust = df_count_cust.groupby(['order_count']).count().reset_index().rename(columns = {
    'customer_unique_id':'num_customer'})

df_count_cust["percentage_customer"] = 100.0 * df_count_cust["num_customer"] / df_count_cust["num_customer"].sum()
df_count_cust
Out[24]:
order_count num_customer order_purchase_year percentage_customer
0 1 303 303 99.019608
1 2 3 3 0.980392
In [25]:
df_count_cust = df_order_2017.groupby(['customer_unique_id']).count().reset_index()
df_count_cust['order_count'] = df_count_cust['order_id']
df_count_cust = df_count_cust.drop(['order_id', 'year_month', 'payment_value', 'time_to_shop'], axis =1)
df_count_cust = df_count_cust.groupby(['order_count']).count().reset_index().rename(columns = {"customer_unique_id" : 'num_customer'})
df_count_cust['percentage_customer'] = 100*df_count_cust['num_customer']/df_count_cust['num_customer'].sum()
df_count_cust
Out[25]:
order_count num_customer order_purchase_year percentage_customer
0 1 41277 41277 97.186382
1 2 1102 1102 2.594651
2 3 75 75 0.176587
3 4 12 12 0.028254
4 5 4 4 0.009418
5 6 1 1 0.002354
6 8 1 1 0.002354
In [26]:
df_count_cust = df_order_2018.groupby(['customer_unique_id']).count().reset_index()
df_count_cust['order_count'] = df_count_cust['order_id']
df_count_cust = df_count_cust.drop(['order_id', 'year_month', 'payment_value', 'time_to_shop'], axis =1)
df_count_cust = df_count_cust.groupby(['order_count']).count().reset_index().rename(columns = {"customer_unique_id" : 'num_customer'})
df_count_cust['percentage_customer'] = 100*df_count_cust['num_customer']/df_count_cust['num_customer'].sum()
df_count_cust
Out[26]:
order_count num_customer order_purchase_year percentage_customer
0 1 50864 50864 97.873732
1 2 1038 1038 1.997345
2 3 56 56 0.107757
3 4 8 8 0.015394
4 5 1 1 0.001924
5 6 1 1 0.001924
6 7 1 1 0.001924

Rootcause Analysis

Why 96% of customers shop with us only once? Key factors include

  • Price
  • Customer Experience - proxy by Review
  • Delivery Duration

Since we have review scores and order delivery time, we will focus here first

In [27]:
df_quality = df.groupby(['order_purchase_year', 'year_month'], as_index = False).mean().loc[:, ['order_purchase_year', 'year_month', 'expected_duration', 'ship_duration', 'tocarrier_duration', 'lastmile_duration', 'expected_vs_shipdate', 'review_score']]
df_quality.head(10)
Out[27]:
order_purchase_year year_month expected_duration ship_duration tocarrier_duration lastmile_duration expected_vs_shipdate review_score
0 2016 2016-09 47.333333 NaN 43.000000 NaN NaN 1.000000
1 2016 2016-10 55.492188 19.702065 14.008621 5.498525 35.097345 3.578125
2 2016 2016-12 26.000000 4.000000 3.000000 1.000000 21.000000 5.000000
3 2017 2017-01 39.846535 12.145511 2.920892 8.841073 26.937049 4.094059
4 2017 2017-02 31.689176 12.655010 3.113344 9.099165 18.157620 4.046177
5 2017 2017-03 24.682514 12.450890 2.715769 9.319051 11.408372 4.052329
6 2017 2017-04 27.062522 14.160470 3.110499 10.564871 12.206753 3.955803
7 2017 2017-05 23.923520 10.863831 2.460258 7.913736 12.349810 4.117715
8 2017 2017-06 23.744143 11.731860 2.720936 8.593290 11.464812 4.124867
9 2017 2017-07 23.031802 11.016386 2.747952 7.789317 11.637795 4.110580
In [28]:
df_quality = df.groupby(['order_purchase_year'], as_index = False).mean().loc[:, ['order_purchase_year', 'expected_duration', 'ship_duration', 'tocarrier_duration', 'lastmile_duration', 'expected_vs_shipdate', 'review_score']]
df_quality.head(10)
Out[28]:
order_purchase_year expected_duration ship_duration tocarrier_duration lastmile_duration expected_vs_shipdate review_score
0 2016 55.353093 19.655882 14.142450 5.485294 35.055882 3.561856
1 2017 24.388270 12.458088 3.001244 8.973969 11.449063 4.033041
2 2018 22.579155 11.613360 2.559633 8.582858 10.600364 4.007279

Although our review score í not too bad, we have a very long end-to-end ship duration. the review score is high becuase our expected delivery date is almost a month, so we beat it everytime!

In [31]:
fig, axs = plt.subplots(nrows = 2, ncols = 2, figsize = (20,10), dpi =120)
df['expected_duration'].plot.hist(bins = 30, alpha = 1, ax = axs[0, 0])
axs[0, 0].set_title('Expected Ship Duration (days)')

df['ship_duration'].plot.hist(bins = 30, alpha = 1, ax = axs[1, 0])
axs[1, 0].set_title('End-to-End Ship Duration (days)')

df['tocarrier_duration'].plot.hist(bins = 30, alpha = 1, ax = axs[0, 1])
axs[0, 1].set_title('Middle mile lead-time: from retailers to carriers(days)')

df['lastmile_duration'].plot.hist(bins = 30, alpha = 1, ax  = axs[1,1])
axs[1, 1].set_title('Last mile lead-time: from carriers to customers (days)')
Out[31]:
Text(0.5, 1.0, 'Last mile lead-time: from carriers to customers (days)')
In [41]:
#drop outliers to make the histograms clearer 
df_quality_chart_1 = df[df.expected_duration < 50] #drop expected duration more than 60 days from purchase date
df_quality_chart_2 = df[df.ship_duration < 50] # drop any end-to-end shiduration more than 60 days from purchase date 
df_quality_chart_3 = df[(df['tocarrier_duration'] < 30) & (df['tocarrier_duration']>0)] # drop any end-to-end ship duration more than 10 days from purchase date
df_quality_chart_4 = df[(df['lastmile_duration'] < 30) & (df['lastmile_duration'] > 0)] # drop any end-to-end ship duration more than 60 days from purchase date
df_quality_chart_5 = df[(df['expected_vs_shipdate'] < 30) & (df['expected_vs_shipdate'] > -30)] #drop any difference beyond 50 dats btw expected ship date and actual ship date

fig, axes = plt.subplots(nrows = 2, ncols =2, figsize = (20, 10), dpi = 120)


df_quality_chart_1['expected_duration'].plot.hist(bins=30, alpha = 1,ax = axes[0,0])
axes[0,0].set_title('Expected Ship Duration 2016-2018 (days)')

df_quality_chart_2['ship_duration'].plot.hist(bins=30, alpha = 1,ax = axes[1,0])
axes[1,0].set_title('End-to-End Ship Duration 2016-2018 (days)')

df_quality_chart_3['tocarrier_duration'].plot.hist(bins=20, alpha = 0.5,ax = axes[0,1])
axes[0,1].set_title('Middle mile lead-time: from retailers to carriers 2016-2018 (days)')

df_quality_chart_4['lastmile_duration'].plot.hist(bins=20, alpha = 0.5,ax = axes[1,1])
axes[1,1].set_title('Last mile lead-time: from carriers to customers 2016-2018 (days)')

#df_quality_chart['lastmile_duration'].plot.hist(bins=30, alpha = 1)
Out[41]:
Text(0.5, 1.0, 'Last mile lead-time: from carriers to customers 2016-2018 (days)')
In [42]:
df_quality_chart_5['expected_vs_shipdate'].plot.hist(bins=15, alpha = 1)
plt.title("Difference between expected ship date and delivered date")
Out[42]:
Text(0.5, 1.0, 'Difference between expected ship date and delivered date')

This delivery performance won't keep any customers. Average duration to ship is between 15-30 days!

In [43]:
df['review_score'].plot.hist(bins = 10, alpha = 1)
plt.title('Review Score 2016-2018')
Out[43]:
Text(0.5, 1.0, 'Review Score 2016-2018')
In [ ]: