The product sale data is analyzed using Python.
import pandas as pd
df= pd.read_csv("/Users/nnthieu/sales.csv", skiprows=0)
df.shape
(15000, 8)
df.head()
week | sales_method | customer_id | nb_sold | revenue | years_as_customer | nb_site_visits | state | |
---|---|---|---|---|---|---|---|---|
0 | 2 | 2e72d641-95ac-497b-bbf8-4861764a7097 | 10 | NaN | 0 | 24 | Arizona | |
1 | 6 | Email + Call | 3998a98d-70f5-44f7-942e-789bb8ad2fe7 | 15 | 225.47 | 1 | 28 | Kansas |
2 | 5 | Call | d1de9884-8059-4065-b10f-86eef57e4a44 | 11 | 52.55 | 6 | 26 | Wisconsin |
3 | 4 | 78aa75a4-ffeb-4817-b1d0-2f030783c5d7 | 11 | NaN | 3 | 25 | Indiana | |
4 | 3 | 10e6d446-10a5-42e5-8210-1b5438f70922 | 9 | 90.49 | 0 | 28 | Illinois |
print(df.columns)
Index(['week', 'sales_method', 'customer_id', 'nb_sold', 'revenue', 'years_as_customer', 'nb_site_visits', 'state'], dtype='object')
df.describe()
week | nb_sold | revenue | years_as_customer | nb_site_visits | |
---|---|---|---|---|---|
count | 15000.000000 | 15000.000000 | 13926.000000 | 15000.000000 | 15000.000000 |
mean | 3.098267 | 10.084667 | 93.934943 | 4.965933 | 24.990867 |
std | 1.656420 | 1.812213 | 47.435312 | 5.044952 | 3.500914 |
min | 1.000000 | 7.000000 | 32.540000 | 0.000000 | 12.000000 |
25% | 2.000000 | 9.000000 | 52.470000 | 1.000000 | 23.000000 |
50% | 3.000000 | 10.000000 | 89.500000 | 3.000000 | 25.000000 |
75% | 5.000000 | 11.000000 | 107.327500 | 7.000000 | 27.000000 |
max | 6.000000 | 16.000000 | 238.320000 | 63.000000 | 41.000000 |
df['sales_method'].unique()
array(['Email', 'Email + Call', 'Call', 'em + call', 'email'], dtype=object)
Some errors such as 'email', 'em + call' should be corrected into 'Email' and 'Email + Call', replacing NA in 'revenue', and correcting the values greater than 40 in 'years_of_customer'.
repl_dict = {'em + call': 'Email + Call', 'email': 'Email'}
df['sales_method'] = df['sales_method'].replace(repl_dict)
df["sales_method"].unique()
array(['Email', 'Email + Call', 'Call'], dtype=object)
df['revenue']=df['revenue'].fillna(93.93)
df.head()
week | sales_method | customer_id | nb_sold | revenue | years_as_customer | nb_site_visits | state | |
---|---|---|---|---|---|---|---|---|
0 | 2 | 2e72d641-95ac-497b-bbf8-4861764a7097 | 10 | 93.93 | 0 | 24 | Arizona | |
1 | 6 | Email + Call | 3998a98d-70f5-44f7-942e-789bb8ad2fe7 | 15 | 225.47 | 1 | 28 | Kansas |
2 | 5 | Call | d1de9884-8059-4065-b10f-86eef57e4a44 | 11 | 52.55 | 6 | 26 | Wisconsin |
3 | 4 | 78aa75a4-ffeb-4817-b1d0-2f030783c5d7 | 11 | 93.93 | 3 | 25 | Indiana | |
4 | 3 | 10e6d446-10a5-42e5-8210-1b5438f70922 | 9 | 90.49 | 0 | 28 | Illinois |
df['years_as_customer'] = df['years_as_customer'].apply(lambda x: 5 if x > 40 else x)
revenue_by_method = df.groupby('sales_method')['revenue'].sum()
print(revenue_by_method)
sales_method Call 244564.82 Email 723415.75 Email + Call 441038.26 Name: revenue, dtype: float64
sales_method_counts = df['sales_method'].value_counts()
print(sales_method_counts)
Email 7466 Call 4962 Email + Call 2572 Name: sales_method, dtype: int64
df.describe()
week | nb_sold | revenue | years_as_customer | nb_site_visits | |
---|---|---|---|---|---|
count | 15000.000000 | 15000.000000 | 15000.000000 | 15000.000000 | 15000.000000 |
mean | 3.098267 | 10.084667 | 93.934589 | 4.959267 | 24.990867 |
std | 1.656420 | 1.812213 | 45.705474 | 5.010903 | 3.500914 |
min | 1.000000 | 7.000000 | 32.540000 | 0.000000 | 12.000000 |
25% | 2.000000 | 9.000000 | 53.040000 | 1.000000 | 23.000000 |
50% | 3.000000 | 10.000000 | 91.865000 | 3.000000 | 25.000000 |
75% | 5.000000 | 11.000000 | 106.070000 | 7.000000 | 27.000000 |
max | 6.000000 | 16.000000 | 238.320000 | 39.000000 | 41.000000 |
mean_revenue_by_week = df2.groupby('week')['revenue'].mean().round(2).reset_index()
print(mean_revenue_by_week)
week revenue 0 1 78.01 1 2 85.26 2 3 81.43 3 4 98.73 4 5 107.65 5 6 148.82
import seaborn as sns
import matplotlib.pyplot as plt
sns.barplot(x='week', y='revenue', data=df)
# Add labels to the bars
ax = sns.barplot(x='week', y='revenue', data=df)
for p in ax.patches:
ax.annotate(format(p.get_height(), '.2f'),
(p.get_x() + p.get_width() / 2., p.get_height()),
ha = 'center', va = 'center',
xytext = (0, 5),
textcoords = 'offset points')
# Add labels and title
plt.xlabel('Week')
plt.ylabel('Average Revenue')
plt.title('Average Revenue by Week')
# Show the plot
plt.show()
sum_revenue_by_week = df.groupby('week')['revenue'].sum().reset_index()
print(sum_revenue_by_week)
week revenue 0 1 293850.38 1 2 213840.06 2 3 198241.77 3 4 253337.40 4 5 274238.72 5 6 175510.50
from matplotlib.ticker import FuncFormatter
# Aggregate data to get the sum of revenue by week
df_agg = df.groupby('week')['revenue'].sum().reset_index()
# Create the bar plot using Seaborn
ax = sns.barplot(x='week', y='revenue', data=df_agg)
# Function to format y-axis ticks with commas for thousands
def format_thousands(x, pos):
return '{:,.0f}'.format(x)
# Apply the function to the y-axis
ax.yaxis.set_major_formatter(FuncFormatter(format_thousands))
# Add values to the bars
for p in ax.patches:
ax.annotate(format(p.get_height(), ',.0f'),
(p.get_x() + p.get_width() / 2., p.get_height()),
ha='center', va='center',
xytext=(0, 5),
textcoords='offset points')
# Add labels and title
plt.xlabel('Week')
plt.ylabel('Total Revenue')
plt.title('Total Revenue by Week')
# Show the plot
plt.show()
from matplotlib.ticker import FuncFormatter
# Aggregate data to get the sum of revenue by week
df_agg = df.groupby('week')['revenue'].sum().reset_index()
# Plot the sum of revenue by week using Seaborn
sns.lineplot(x='week', y='revenue', data=df_agg, marker='o')
# Function to format y-axis ticks with commas for thousands
def format_thousands(x, pos):
return '{:,.0f}'.format(x)
# Apply the function to the y-axis
plt.gca().yaxis.set_major_formatter(FuncFormatter(format_thousands))
# Add labels and title
plt.xlabel('Week')
plt.ylabel('Total Revenue')
plt.title('Total Revenue by Week')
# Show the plot
plt.grid(True)
plt.show()
Top 10 states having most orders and most revenue for the company.
state_counts = df['state'].value_counts().reset_index().rename(columns={'index':'state', 'state':'count'})
print(state_counts.head(10))
state count 0 California 1872 1 Texas 1187 2 New York 965 3 Florida 904 4 Illinois 617 5 Pennsylvania 598 6 Ohio 566 7 Michigan 498 8 Georgia 489 9 North Carolina 459
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
# Aggregate data to get the sum of revenue by week
df2_agg = df.groupby('state')['revenue'].sum().reset_index().head(10)
df2_agg = df2_agg.sort_values(by='revenue', ascending=False)
print(df2_agg)
# Create the bar plot using Seaborn
ax = sns.barplot(x='state', y='revenue', data=df2_agg)
# Function to format y-axis ticks with commas for thousands
def format_thousands(x, pos):
return '{:,.0f}'.format(x)
# Apply the function to the y-axis
ax.yaxis.set_major_formatter(FuncFormatter(format_thousands))
plt.xticks(rotation=90)
# Add values to the bars
for p in ax.patches:
ax.annotate(format(p.get_height(), ',.0f'),
(p.get_x() + p.get_width() / 2., p.get_height()),
ha='center', va='center',
xytext=(0, 5),
textcoords='offset points')
# Add labels and title
plt.xlabel('State')
plt.ylabel('Total Revenue')
plt.title('Total Revenue by State')
# Show the plot
plt.show()
state revenue 4 California 173533.58 8 Florida 84978.29 9 Georgia 46150.38 2 Arizona 29643.36 5 Colorado 21236.22 0 Alabama 20711.02 6 Connecticut 17479.81 3 Arkansas 12110.40 1 Alaska 3669.64 7 Delaware 3438.33
California is the first state bring about most revenue for the company. Let look at the revenue by week of California.
Cali_df = df.query('state == "California"')
Cali_df.head()
week | sales_method | customer_id | nb_sold | revenue | years_as_customer | nb_site_visits | state | |
---|---|---|---|---|---|---|---|---|
13 | 2 | 6103bcac-9da6-4000-a0ce-fa2615cce846 | 10 | 101.54 | 1 | 28 | California | |
42 | 3 | 0fe31def-6ac2-4559-ba9a-23d65078eabf | 9 | 86.51 | 3 | 25 | California | |
46 | 1 | a46ed8d0-f927-4fa8-a1db-19108a236abe | 10 | 95.02 | 4 | 24 | California | |
69 | 4 | f57fcf83-e103-4c21-a110-e943c77a0a03 | 11 | 109.52 | 7 | 20 | California | |
81 | 2 | Call | 7f2244c1-bf0b-4afc-8b43-206a087633d9 | 10 | 48.98 | 1 | 27 | California |
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
Cal_df = df.query('state == "California"')
# Aggregate data to get the sum of revenue by week
Cal_df_agg = Cal_df.groupby('week')['revenue'].sum().reset_index()
# Plot the sum of revenue by week using Seaborn
sns.lineplot(x='week', y='revenue', data=Cal_df_agg, marker='o')
# Function to format y-axis ticks with commas for thousands
def format_thousands(x, pos):
return '{:,.0f}'.format(x)
# Apply the function to the y-axis
plt.gca().yaxis.set_major_formatter(FuncFormatter(format_thousands))
# Add labels and title
plt.xlabel('Week')
plt.ylabel('Total Revenue')
plt.title('California Total Revenue by Week')
# Show the plot
plt.grid(True)
plt.show()