Product Sales Analyzed Using Python¶

The product sale data is analyzed using Python.

Loading and checking data¶

In [1]:
import pandas as pd
df= pd.read_csv("/Users/nnthieu/sales.csv", skiprows=0)
In [24]:
df.shape
Out[24]:
(15000, 8)
In [25]:
df.head()
Out[25]:
week sales_method customer_id nb_sold revenue years_as_customer nb_site_visits state
0 2 Email 2e72d641-95ac-497b-bbf8-4861764a7097 10 NaN 0 24 Arizona
1 6 Email + Call 3998a98d-70f5-44f7-942e-789bb8ad2fe7 15 225.47 1 28 Kansas
2 5 Call d1de9884-8059-4065-b10f-86eef57e4a44 11 52.55 6 26 Wisconsin
3 4 Email 78aa75a4-ffeb-4817-b1d0-2f030783c5d7 11 NaN 3 25 Indiana
4 3 Email 10e6d446-10a5-42e5-8210-1b5438f70922 9 90.49 0 28 Illinois
In [26]:
print(df.columns)
Index(['week', 'sales_method', 'customer_id', 'nb_sold', 'revenue',
       'years_as_customer', 'nb_site_visits', 'state'],
      dtype='object')
In [28]:
df.describe()
Out[28]:
week nb_sold revenue years_as_customer nb_site_visits
count 15000.000000 15000.000000 13926.000000 15000.000000 15000.000000
mean 3.098267 10.084667 93.934943 4.965933 24.990867
std 1.656420 1.812213 47.435312 5.044952 3.500914
min 1.000000 7.000000 32.540000 0.000000 12.000000
25% 2.000000 9.000000 52.470000 1.000000 23.000000
50% 3.000000 10.000000 89.500000 3.000000 25.000000
75% 5.000000 11.000000 107.327500 7.000000 27.000000
max 6.000000 16.000000 238.320000 63.000000 41.000000
In [29]:
df['sales_method'].unique()
Out[29]:
array(['Email', 'Email + Call', 'Call', 'em + call', 'email'],
      dtype=object)

Some errors such as 'email', 'em + call' should be corrected into 'Email' and 'Email + Call', replacing NA in 'revenue', and correcting the values greater than 40 in 'years_of_customer'.

Cleaning data¶

In [2]:
repl_dict = {'em + call': 'Email + Call', 'email': 'Email'}
df['sales_method'] = df['sales_method'].replace(repl_dict)
df["sales_method"].unique()
Out[2]:
array(['Email', 'Email + Call', 'Call'], dtype=object)
In [3]:
df['revenue']=df['revenue'].fillna(93.93)
df.head()
Out[3]:
week sales_method customer_id nb_sold revenue years_as_customer nb_site_visits state
0 2 Email 2e72d641-95ac-497b-bbf8-4861764a7097 10 93.93 0 24 Arizona
1 6 Email + Call 3998a98d-70f5-44f7-942e-789bb8ad2fe7 15 225.47 1 28 Kansas
2 5 Call d1de9884-8059-4065-b10f-86eef57e4a44 11 52.55 6 26 Wisconsin
3 4 Email 78aa75a4-ffeb-4817-b1d0-2f030783c5d7 11 93.93 3 25 Indiana
4 3 Email 10e6d446-10a5-42e5-8210-1b5438f70922 9 90.49 0 28 Illinois
In [4]:
df['years_as_customer'] = df['years_as_customer'].apply(lambda x: 5 if x > 40 else x)

Summarizing data¶

In [5]:
revenue_by_method = df.groupby('sales_method')['revenue'].sum()
print(revenue_by_method)
sales_method
Call            244564.82
Email           723415.75
Email + Call    441038.26
Name: revenue, dtype: float64
In [6]:
sales_method_counts = df['sales_method'].value_counts()
print(sales_method_counts)
Email           7466
Call            4962
Email + Call    2572
Name: sales_method, dtype: int64
In [62]:
df.describe()
Out[62]:
week nb_sold revenue years_as_customer nb_site_visits
count 15000.000000 15000.000000 15000.000000 15000.000000 15000.000000
mean 3.098267 10.084667 93.934589 4.959267 24.990867
std 1.656420 1.812213 45.705474 5.010903 3.500914
min 1.000000 7.000000 32.540000 0.000000 12.000000
25% 2.000000 9.000000 53.040000 1.000000 23.000000
50% 3.000000 10.000000 91.865000 3.000000 25.000000
75% 5.000000 11.000000 106.070000 7.000000 27.000000
max 6.000000 16.000000 238.320000 39.000000 41.000000
In [42]:
mean_revenue_by_week = df2.groupby('week')['revenue'].mean().round(2).reset_index()
print(mean_revenue_by_week)
   week  revenue
0     1    78.01
1     2    85.26
2     3    81.43
3     4    98.73
4     5   107.65
5     6   148.82

Charts¶

In [34]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.barplot(x='week', y='revenue', data=df)

# Add labels to the bars
ax = sns.barplot(x='week', y='revenue', data=df)
for p in ax.patches:
    ax.annotate(format(p.get_height(), '.2f'), 
                (p.get_x() + p.get_width() / 2., p.get_height()), 
                ha = 'center', va = 'center', 
                xytext = (0, 5), 
                textcoords = 'offset points')

# Add labels and title
plt.xlabel('Week')
plt.ylabel('Average Revenue')
plt.title('Average Revenue by Week')

# Show the plot
plt.show()
In [43]:
sum_revenue_by_week = df.groupby('week')['revenue'].sum().reset_index()
print(sum_revenue_by_week)
   week    revenue
0     1  293850.38
1     2  213840.06
2     3  198241.77
3     4  253337.40
4     5  274238.72
5     6  175510.50
In [36]:
from matplotlib.ticker import FuncFormatter

# Aggregate data to get the sum of revenue by week
df_agg = df.groupby('week')['revenue'].sum().reset_index()

# Create the bar plot using Seaborn
ax = sns.barplot(x='week', y='revenue', data=df_agg)

# Function to format y-axis ticks with commas for thousands
def format_thousands(x, pos):
    return '{:,.0f}'.format(x)

# Apply the function to the y-axis
ax.yaxis.set_major_formatter(FuncFormatter(format_thousands))


# Add values to the bars
for p in ax.patches:
    ax.annotate(format(p.get_height(), ',.0f'), 
                (p.get_x() + p.get_width() / 2., p.get_height()), 
                ha='center', va='center', 
                xytext=(0, 5), 
                textcoords='offset points')

# Add labels and title
plt.xlabel('Week')
plt.ylabel('Total Revenue')
plt.title('Total Revenue by Week')

# Show the plot
plt.show()
In [37]:
from matplotlib.ticker import FuncFormatter

# Aggregate data to get the sum of revenue by week
df_agg = df.groupby('week')['revenue'].sum().reset_index()

# Plot the sum of revenue by week using Seaborn
sns.lineplot(x='week', y='revenue', data=df_agg, marker='o')

# Function to format y-axis ticks with commas for thousands
def format_thousands(x, pos):
    return '{:,.0f}'.format(x)

# Apply the function to the y-axis
plt.gca().yaxis.set_major_formatter(FuncFormatter(format_thousands))


# Add labels and title
plt.xlabel('Week')
plt.ylabel('Total Revenue')
plt.title('Total Revenue by Week')

# Show the plot
plt.grid(True)
plt.show()

Top 10 states having most orders and most revenue for the company.

In [113]:
state_counts = df['state'].value_counts().reset_index().rename(columns={'index':'state', 'state':'count'})
print(state_counts.head(10))
            state  count
0      California   1872
1           Texas   1187
2        New York    965
3         Florida    904
4        Illinois    617
5    Pennsylvania    598
6            Ohio    566
7        Michigan    498
8         Georgia    489
9  North Carolina    459
In [38]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Aggregate data to get the sum of revenue by week
df2_agg = df.groupby('state')['revenue'].sum().reset_index().head(10)

df2_agg = df2_agg.sort_values(by='revenue', ascending=False)
print(df2_agg)

# Create the bar plot using Seaborn
ax = sns.barplot(x='state', y='revenue', data=df2_agg)

# Function to format y-axis ticks with commas for thousands
def format_thousands(x, pos):
    return '{:,.0f}'.format(x)

# Apply the function to the y-axis
ax.yaxis.set_major_formatter(FuncFormatter(format_thousands))
plt.xticks(rotation=90)

# Add values to the bars
for p in ax.patches:
    ax.annotate(format(p.get_height(), ',.0f'), 
                (p.get_x() + p.get_width() / 2., p.get_height()), 
                ha='center', va='center', 
                xytext=(0, 5), 
                textcoords='offset points')

# Add labels and title
plt.xlabel('State')
plt.ylabel('Total Revenue')
plt.title('Total Revenue by State')

# Show the plot
plt.show()
         state    revenue
4   California  173533.58
8      Florida   84978.29
9      Georgia   46150.38
2      Arizona   29643.36
5     Colorado   21236.22
0      Alabama   20711.02
6  Connecticut   17479.81
3     Arkansas   12110.40
1       Alaska    3669.64
7     Delaware    3438.33

California is the first state bring about most revenue for the company. Let look at the revenue by week of California.

In [10]:
Cali_df = df.query('state == "California"')
Cali_df.head()
Out[10]:
week sales_method customer_id nb_sold revenue years_as_customer nb_site_visits state
13 2 Email 6103bcac-9da6-4000-a0ce-fa2615cce846 10 101.54 1 28 California
42 3 Email 0fe31def-6ac2-4559-ba9a-23d65078eabf 9 86.51 3 25 California
46 1 Email a46ed8d0-f927-4fa8-a1db-19108a236abe 10 95.02 4 24 California
69 4 Email f57fcf83-e103-4c21-a110-e943c77a0a03 11 109.52 7 20 California
81 2 Call 7f2244c1-bf0b-4afc-8b43-206a087633d9 10 48.98 1 27 California
In [9]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
Cal_df = df.query('state == "California"')

# Aggregate data to get the sum of revenue by week
Cal_df_agg = Cal_df.groupby('week')['revenue'].sum().reset_index()

# Plot the sum of revenue by week using Seaborn
sns.lineplot(x='week', y='revenue', data=Cal_df_agg, marker='o')

# Function to format y-axis ticks with commas for thousands
def format_thousands(x, pos):
    return '{:,.0f}'.format(x)

# Apply the function to the y-axis
plt.gca().yaxis.set_major_formatter(FuncFormatter(format_thousands))


# Add labels and title
plt.xlabel('Week')
plt.ylabel('Total Revenue')
plt.title('California Total Revenue by Week')

# Show the plot
plt.grid(True)
plt.show()
In [ ]: