import pandas as pd
df= pd.read_csv("/Users/nnthieu/sales.csv", skiprows=0)


df.shape

(15000, 8)


df.head()


print(df.columns)

Index(['week', 'sales_method', 'customer_id', 'nb_sold', 'revenue',
       'years_as_customer', 'nb_site_visits', 'state'],
      dtype='object')


df.describe()


df['sales_method'].unique()

array(['Email', 'Email + Call', 'Call', 'em + call', 'email'],
      dtype=object)


repl_dict = {'em + call': 'Email + Call', 'email': 'Email'}
df['sales_method'] = df['sales_method'].replace(repl_dict)
df["sales_method"].unique()

array(['Email', 'Email + Call', 'Call'], dtype=object)


df['revenue']=df['revenue'].fillna(93.93)
df.head()


df['years_as_customer'] = df['years_as_customer'].apply(lambda x: 5 if x > 40 else x)


revenue_by_method = df.groupby('sales_method')['revenue'].sum()
print(revenue_by_method)

sales_method
Call            244564.82
Email           723415.75
Email + Call    441038.26
Name: revenue, dtype: float64


sales_method_counts = df['sales_method'].value_counts()
print(sales_method_counts)

Email           7466
Call            4962
Email + Call    2572
Name: sales_method, dtype: int64


df.describe()


mean_revenue_by_week = df2.groupby('week')['revenue'].mean().round(2).reset_index()
print(mean_revenue_by_week)

   week  revenue
0     1    78.01
1     2    85.26
2     3    81.43
3     4    98.73
4     5   107.65
5     6   148.82


import seaborn as sns
import matplotlib.pyplot as plt

sns.barplot(x='week', y='revenue', data=df)

# Add labels to the bars
ax = sns.barplot(x='week', y='revenue', data=df)
for p in ax.patches:
    ax.annotate(format(p.get_height(), '.2f'), 
                (p.get_x() + p.get_width() / 2., p.get_height()), 
                ha = 'center', va = 'center', 
                xytext = (0, 5), 
                textcoords = 'offset points')

# Add labels and title
plt.xlabel('Week')
plt.ylabel('Average Revenue')
plt.title('Average Revenue by Week')

# Show the plot
plt.show()


sum_revenue_by_week = df.groupby('week')['revenue'].sum().reset_index()
print(sum_revenue_by_week)

   week    revenue
0     1  293850.38
1     2  213840.06
2     3  198241.77
3     4  253337.40
4     5  274238.72
5     6  175510.50


from matplotlib.ticker import FuncFormatter

# Aggregate data to get the sum of revenue by week
df_agg = df.groupby('week')['revenue'].sum().reset_index()

# Create the bar plot using Seaborn
ax = sns.barplot(x='week', y='revenue', data=df_agg)

# Function to format y-axis ticks with commas for thousands
def format_thousands(x, pos):
    return '{:,.0f}'.format(x)

# Apply the function to the y-axis
ax.yaxis.set_major_formatter(FuncFormatter(format_thousands))


# Add values to the bars
for p in ax.patches:
    ax.annotate(format(p.get_height(), ',.0f'), 
                (p.get_x() + p.get_width() / 2., p.get_height()), 
                ha='center', va='center', 
                xytext=(0, 5), 
                textcoords='offset points')

# Add labels and title
plt.xlabel('Week')
plt.ylabel('Total Revenue')
plt.title('Total Revenue by Week')

# Show the plot
plt.show()


from matplotlib.ticker import FuncFormatter

# Aggregate data to get the sum of revenue by week
df_agg = df.groupby('week')['revenue'].sum().reset_index()

# Plot the sum of revenue by week using Seaborn
sns.lineplot(x='week', y='revenue', data=df_agg, marker='o')

# Function to format y-axis ticks with commas for thousands
def format_thousands(x, pos):
    return '{:,.0f}'.format(x)

# Apply the function to the y-axis
plt.gca().yaxis.set_major_formatter(FuncFormatter(format_thousands))


# Add labels and title
plt.xlabel('Week')
plt.ylabel('Total Revenue')
plt.title('Total Revenue by Week')

# Show the plot
plt.grid(True)
plt.show()


state_counts = df['state'].value_counts().reset_index().rename(columns={'index':'state', 'state':'count'})
print(state_counts.head(10))

            state  count
0      California   1872
1           Texas   1187
2        New York    965
3         Florida    904
4        Illinois    617
5    Pennsylvania    598
6            Ohio    566
7        Michigan    498
8         Georgia    489
9  North Carolina    459


import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Aggregate data to get the sum of revenue by week
df2_agg = df.groupby('state')['revenue'].sum().reset_index().head(10)

df2_agg = df2_agg.sort_values(by='revenue', ascending=False)
print(df2_agg)

# Create the bar plot using Seaborn
ax = sns.barplot(x='state', y='revenue', data=df2_agg)

# Function to format y-axis ticks with commas for thousands
def format_thousands(x, pos):
    return '{:,.0f}'.format(x)

# Apply the function to the y-axis
ax.yaxis.set_major_formatter(FuncFormatter(format_thousands))
plt.xticks(rotation=90)

# Add values to the bars
for p in ax.patches:
    ax.annotate(format(p.get_height(), ',.0f'), 
                (p.get_x() + p.get_width() / 2., p.get_height()), 
                ha='center', va='center', 
                xytext=(0, 5), 
                textcoords='offset points')

# Add labels and title
plt.xlabel('State')
plt.ylabel('Total Revenue')
plt.title('Total Revenue by State')

# Show the plot
plt.show()

         state    revenue
4   California  173533.58
8      Florida   84978.29
9      Georgia   46150.38
2      Arizona   29643.36
5     Colorado   21236.22
0      Alabama   20711.02
6  Connecticut   17479.81
3     Arkansas   12110.40
1       Alaska    3669.64
7     Delaware    3438.33


Cali_df = df.query('state == "California"')
Cali_df.head()


import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
Cal_df = df.query('state == "California"')

# Aggregate data to get the sum of revenue by week
Cal_df_agg = Cal_df.groupby('week')['revenue'].sum().reset_index()

# Plot the sum of revenue by week using Seaborn
sns.lineplot(x='week', y='revenue', data=Cal_df_agg, marker='o')

# Function to format y-axis ticks with commas for thousands
def format_thousands(x, pos):
    return '{:,.0f}'.format(x)

# Apply the function to the y-axis
plt.gca().yaxis.set_major_formatter(FuncFormatter(format_thousands))


# Add labels and title
plt.xlabel('Week')
plt.ylabel('Total Revenue')
plt.title('California Total Revenue by Week')

# Show the plot
plt.grid(True)
plt.show()

	week	nb_sold	revenue	years_as_customer	nb_site_visits
count	15000.000000	15000.000000	13926.000000	15000.000000	15000.000000
mean	3.098267	10.084667	93.934943	4.965933	24.990867
std	1.656420	1.812213	47.435312	5.044952	3.500914
min	1.000000	7.000000	32.540000	0.000000	12.000000
25%	2.000000	9.000000	52.470000	1.000000	23.000000
50%	3.000000	10.000000	89.500000	3.000000	25.000000
75%	5.000000	11.000000	107.327500	7.000000	27.000000
max	6.000000	16.000000	238.320000	63.000000	41.000000

	week	nb_sold	revenue	years_as_customer	nb_site_visits
count	15000.000000	15000.000000	15000.000000	15000.000000	15000.000000
mean	3.098267	10.084667	93.934589	4.959267	24.990867
std	1.656420	1.812213	45.705474	5.010903	3.500914
min	1.000000	7.000000	32.540000	0.000000	12.000000
25%	2.000000	9.000000	53.040000	1.000000	23.000000
50%	3.000000	10.000000	91.865000	3.000000	25.000000
75%	5.000000	11.000000	106.070000	7.000000	27.000000
max	6.000000	16.000000	238.320000	39.000000	41.000000

Product Sales Analyzed Using Python¶

Loading and checking data¶

Cleaning data¶

Summarizing data¶

Charts¶

	week	sales_method	customer_id	nb_sold	revenue	years_as_customer	nb_site_visits	state
0	2	Email	2e72d641-95ac-497b-bbf8-4861764a7097	10	NaN	0	24	Arizona
1	6	Email + Call	3998a98d-70f5-44f7-942e-789bb8ad2fe7	15	225.47	1	28	Kansas
2	5	Call	d1de9884-8059-4065-b10f-86eef57e4a44	11	52.55	6	26	Wisconsin
3	4	Email	78aa75a4-ffeb-4817-b1d0-2f030783c5d7	11	NaN	3	25	Indiana
4	3	Email	10e6d446-10a5-42e5-8210-1b5438f70922	9	90.49	0	28	Illinois

	week	sales_method	customer_id	nb_sold	revenue	years_as_customer	nb_site_visits	state
0	2	Email	2e72d641-95ac-497b-bbf8-4861764a7097	10	93.93	0	24	Arizona
1	6	Email + Call	3998a98d-70f5-44f7-942e-789bb8ad2fe7	15	225.47	1	28	Kansas
2	5	Call	d1de9884-8059-4065-b10f-86eef57e4a44	11	52.55	6	26	Wisconsin
3	4	Email	78aa75a4-ffeb-4817-b1d0-2f030783c5d7	11	93.93	3	25	Indiana
4	3	Email	10e6d446-10a5-42e5-8210-1b5438f70922	9	90.49	0	28	Illinois

	week	sales_method	customer_id	nb_sold	revenue	years_as_customer	nb_site_visits	state
13	2	Email	6103bcac-9da6-4000-a0ce-fa2615cce846	10	101.54	1	28	California
42	3	Email	0fe31def-6ac2-4559-ba9a-23d65078eabf	9	86.51	3	25	California
46	1	Email	a46ed8d0-f927-4fa8-a1db-19108a236abe	10	95.02	4	24	California
69	4	Email	f57fcf83-e103-4c21-a110-e943c77a0a03	11	109.52	7	20	California
81	2	Call	7f2244c1-bf0b-4afc-8b43-206a087633d9	10	48.98	1	27	California