import os
os.environ['QT_QPA_PLATFORM_PLUGIN_PATH'] = 'C:/ProgramData/Anaconda3/Library/plugins/platforms'
import numpy
import pandas
import matplotlib
import seaborn

Analysis of Amazon Sales Data

This dataset is comprised of Amazon sales data, with information regarding transaction date, method of purchase, order information, description of the product sold, as well as information regarding the customer’s region. Overall, it was a very interesting set of data and the charts below can be utilized to help Amazon decision makers on where they are succeeding and where they need improvement. Check out the visualizations below to see a variety of information that can be used to create actionable business decision.

First is a Vertical Bar Chart

In this bar chart, we can see in ascending fashion, the number of transactions executed by month. It is clear that there is not a large discrepancy between their highest and lowest performing months, but they can improve by finding ways to increase sales in February & April. August and January are doing great, and are their highest performing months by number of transactions.

import pandas as pd
import numpy as np
import wget as wg
import matplotlib.pyplot as plt
df = pd.read_csv("cleaned_dataset[1].csv")
df['order_date'] = pd.to_datetime(df['order_date'], format = '%Y/%m/%d')
df['Year'] = df['order_date'].dt.year
df['Quarter'] = df['order_date'].dt.quarter
df['Month'] = df['order_date'].dt.month
df['MonthName'] = df['order_date'].dt.strftime('%b')
df['DayName'] = df['order_date'].dt.strftime('%a')
x = df.groupby(['MonthName']).agg({'MonthName':['count']}).reset_index()
x.columns = ['Month', 'Count']
x =x.sort_values('Count', ascending = True)
plt.figure(figsize=(18,10))
plt.bar(x['Month'], x['Count'], label = 'Number of Monthly Transactions', color = 'green')
plt.legend(loc = 'upper right', fontsize = 14)
plt.title("Transactions Executed By Month (Ascending)")
plt.xlabel("Month", fontsize = 14)
plt.ylabel("Number of Transactions", fontsize = 14)
plt.xticks(fontsize = 14)
plt.show()

## Number 2: Horizontal Bar Chart Examining Most Popular Product Categories In this chart, we can look at which product categories perform the best by transactions per category. Across the 6 categories, it is clear that Beauty and Fashion perform the best, whereas Home & Kitchen and Electronics are not as popular.

x3 = df.groupby(['product_category']).agg({'product_category':['count']}).reset_index()
x3.columns = ['product_category', 'Count']
plt.figure(figsize=(18,10))
plt.barh(x3['product_category'], x3['Count'], label = 'Number of transactions by product category', color = 'teal')
plt.legend(loc='upper right', fontsize = 10)
plt.title("Transactions executed per product category")
plt.xlabel("Total Transactions", fontsize = 12)
plt.ylabel("Category", fontsize = 12)
plt.yticks(fontsize = 14)
## ([0, 1, 2, 3, 4, 5], [Text(0, 0, 'Beauty'), Text(0, 1, 'Books'), Text(0, 2, 'Electronics'), Text(0, 3, 'Fashion'), Text(0, 4, 'Home & Kitchen'), Text(0, 5, 'Sports')])
plt.show()

##Stacked Bar Plot Examining Revenue by Category Across Customer Regions In this chart, we see the crosssection of revenue gained by category across various regions. We see that Asia and the Middleeast have the overall highest performace in revenue. Within these categories it is evident that Beauty and Home & Kitchen perform well. In Europe, the lowest performing category is Home & Beauty, and in North America, it is Electronics.

stacked_df = df.groupby(['customer_region','product_category'])['total_revenue'].sum().reset_index(name='Revenue By Region and Category')
stacked_df = stacked_df.pivot(index='customer_region', columns='product_category', values='Revenue By Region and Category')
fig = plt.figure(figsize = (18,10))
ax = fig.add_subplot(1,1,1)

stacked_df.plot(kind='bar', stacked=True, ax=ax)
plt.ylabel('Total Revenue', fontsize=18, labelpad=10)
plt.title('Total Revenue By Category Across Customer Regions \nStacked Bar Plot', fontsize=18)
plt.xticks(rotation=0, horizontalalignment = 'center', fontsize=14)
## (array([0, 1, 2, 3]), [Text(0, 0, 'Asia'), Text(1, 0, 'Europe'), Text(2, 0, 'Middle East'), Text(3, 0, 'North America')])
plt.yticks(fontsize=14)
## (array([      0., 1000000., 2000000., 3000000., 4000000., 5000000.,
##        6000000., 7000000., 8000000., 9000000.]), [Text(0, 0.0, '0'), Text(0, 1000000.0, '1'), Text(0, 2000000.0, '2'), Text(0, 3000000.0, '3'), Text(0, 4000000.0, '4'), Text(0, 5000000.0, '5'), Text(0, 6000000.0, '6'), Text(0, 7000000.0, '7'), Text(0, 8000000.0, '8'), Text(0, 9000000.0, '9')])
ax.set_xlabel('Region', fontsize=18)

from matplotlib.ticker import FuncFormatter
ax.yaxis.set_major_formatter(FuncFormatter(lambda x, pos: ('$%1.1fM')%(x*1e-6)))
plt.show()

##Line Graph Examining Monthly Revenue of 2023 In this visualization, we have a line graph that examines the monthly revenue earned in the year 2023, the most recent year in the dataset. It peaks in January, and falls quickly in February. It rebounds around 2.5 million monthly throughout the rest of the year, and peaks again in August at approximately 2.8 million.

df_2023= df[df['Year']== 2023]
df_2023=df.groupby(['Month'])['total_revenue'].sum().reset_index(name='MonthlyRevenue')
fig = plt.figure(figsize = (18,10))
ax = fig.add_subplot(1,1,1)

df_2023.plot(ax=ax, kind='line', x='Month', y='MonthlyRevenue', color='blue', marker = 'o')
plt.title('Revenue by Month (2023)', fontsize = 20)
plt.ylabel('Revenue', fontsize = 14)
plt.xlabel('Month of the Year', fontsize =14)
ax.tick_params(axis='y', labelsize=12)
ax.tick_params(axis='x', labelsize=12)

from matplotlib.ticker import FuncFormatter
ax.yaxis.set_major_formatter(FuncFormatter(lambda x, pos: ('$%1.1fM')%(x*1e-6)))
plt.show()

##Number 5: Pie Chart Examining Method of Payment by Revenue Share In this graph, we see a relatively even spread of methods of payment across the five options by amount of total revenue earned. The highest earning method was Wallet, followed by UPI, then Cash on Delivery, Credit Card, and Debit Card being the least profitable.

paymentmethod_df = df.groupby(['payment_method'])['total_revenue'].sum().reset_index(name='TotalRevenue')
number_outside_colors=len(paymentmethod_df.payment_method.unique())
outside_color_ref_number = np.arange(number_outside_colors)*3
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(1,1,1)

colormap=plt.get_cmap("tab20c")
outer_colors=colormap(outside_color_ref_number)

all_revenue = paymentmethod_df.TotalRevenue.sum()

paymentmethod_df.groupby(['payment_method'])['TotalRevenue'].sum().plot(
    kind='pie',radius=1,colors= outer_colors, pctdistance = .6, labeldistance = 1.2, wedgeprops = dict(edgecolor='w'),
    textprops={'fontsize':18}, autopct = lambda p:'{:.2f}%\n(${:.2}M)'.format(p,(p/100)*all_revenue/1e6),
    startangle=90)
plt.title('Revenue By Method of Payment', fontsize=16)
plt.show()