Exploratory data analysis (EDA) of an eCommerce dataset

0.1 Data cleaning, editing and basic profiling
0.2 Data Exploration and Summarization
0.3 Unit Economics

Rationale

The main aim of the project was to perform exploratory data analysis using an eCommerce dataset for the year 2011

Key Performance Indicators (KPI’s) based on Revenue and Customer numbers along with Unit economics like Average Revenue per Customer (ARPU) were calculated in order to observe trends if/any in the dataset

0.1 Data cleaning, editing and basic profiling

Data was cleaned and edited done using DBeaver IDE Scriptand further editing was carried out in RStudio

Number of rows in the dataset


select
count(*) as number_entries
from
ecommerce_data
;

1 records
number_entries
541909

0.2 Data Exploration and Summarization

Exploring and summarizing the revenue generated (KPI) for the year 2011 based on item type and countries Creating a temporary table for generating the revenue summary based on different fields


create temp table revenue_item_type as (
SELECT 
rank () over (order by sum("UnitPrice" * "Quantity")::numeric desc) as item_rank,
initcap("Description") as Item_description,
"Country" as Country,
round(sum("UnitPrice" * "Quantity")::numeric,3) AS revenue
  FROM ecommerce_data
  where "CustomerID" is not null and "InvoiceDate" >= '2011-01-01'
  group by "Description","Country"
  order by revenue desc
  );

0.2.1 Top 10 items generating the maximum revenue


  SELECT 
Item_description as Items, 
revenue as Revenue
  FROM revenue_item_type
  limit 10
  ;

The output of the above sql query is stored in a dataframe titled max_revenue

DT::datatable(max_revenue,
              class = 'cell-border stripe',
              rownames = FALSE,
              caption = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left;',
    htmltools::em('Top 10 items generating the maximum revenue')))

Visualizing the maximum revenue by items

require(plotly)
require(forcats)
max_revenue%>%
    plot_ly(x = ~forcats::fct_reorder(items,revenue),
            y=~revenue,
            type = "bar",
            color = I('brown'))%>%
    layout(title = 'Top 10 items by revenue',
           xaxis = list(title = "Items"),
           yaxis = list(title = "Revenue"))

It could be seen that the the three tiered Regency Cakestand was the item whcih generated the maximum revenue

0.2.2 Top 5 countries generating the maximum revenue


SELECT 
 Country,
 revenue
 from
(select
rank() over (order by sum(revenue) desc) as country_rank,
 Country,
sum(revenue) as revenue
  FROM revenue_item_type
  group by Country
  order by sum(revenue) desc) subq
  limit 5
  ;

The output of the above sql query is stored in a dataframe titled max_revenue_country

DT::datatable(max_revenue_country,
              class = 'cell-border stripe',
              rownames = FALSE,
              caption = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left;',
    htmltools::em('Top 5 countries generating the maximum revenue')))

0.2.3 Visualizing the top 5 countries by revenue generated

#Changing the name of EIRE to Ireland

max_revenue_country[3,]<-c("Ireland",
                           max_revenue_country[3,"revenue"])

# GIS data of the countries is first added to the dataset using rworldmap and rgeos(centroid GIS data) packages for mapping the countries

if(!require(rworldmap))install.packages('rworldmap')

if(!require(rgeos))install.packages('rgeos')

# Obtaining the world map centroids

world_centroids <- rworldmap::getMap(resolution="high")%>%
    rgeos::gCentroid(.,byid=TRUE)%>%
    data.frame(.)%>%
    rownames_to_column(.)%>%
    dplyr::rename(name=rowname)

# Combining the centroid dataset with the revenue data

country_data_gis<- max_revenue_country%>%
    inner_join(world_centroids,
               by = c('country' = 'name'))%>%
    dplyr::rename(lon=x,
                  lat=y)

# For interactivity, the data are converted into a shared data object using the crosstalk package to help in data visualization interactivity

require(crosstalk)

shared_country_gis <- SharedData$new(country_data_gis,
                                     key = ~country)


# Plot 1: Barchart of the top 5 countries by revenue

barchart_country<-shared_country_gis%>%
    plot_ly(x = ~forcats::fct_reorder(country,revenue),
            y=~sqrt(as.numeric(revenue)),
            type = "bar")%>%
    layout(title = 'Top 5 countries by revenue',
           xaxis = list(title = "Countries"),
           yaxis = list(title = "Revenue"))

#Plot 2: Map to visualize the countries using leaflet package

if(!require(leaflet))install.packages('leaflet') 

#Color scheme for the GIS points

colour_combo <- colorFactor(c("orange", "green", "black"),
                            domain = unique(shared_country_gis$country))


# Map

locality_map <- leaflet(shared_country_gis) %>%
    addProviderTiles("Stamen.Terrain") %>%
    addCircleMarkers(
        color = ~colour_combo(country),
        stroke = TRUE, 
        fillOpacity = 1,
        radius=5,
         lng = ~lon, 
        lat = ~lat,
        label = ~as.character(country),
        popup = paste0("<strong>Country: </strong>", country_data_gis$country, "</br>",
                   "<strong>revenue: </strong>", country_data_gis$revenue))%>%
    addTiles()

# Combining the two plots to have interactivity (using crosstalk package)

crosstalk::bscols(widths=c(5,7),
                  barchart_country,
                  locality_map)

Of the 37 total countries in the dataset,European countries generated the maximum revenue

0.2.4 Monthly revenue for the year 2011


 SELECT 
 months_2011,
 delivr_month,
 revenue
 from(
 select
 to_char("InvoiceDate",'FMMonth') as months_2011,
  DATE_TRUNC('month', "InvoiceDate") :: DATE as delivr_month,
  round(sum("UnitPrice" * "Quantity")::numeric,3) AS revenue
 FROM ecommerce_data
 where "InvoiceDate" >= '2011-01-01'
 group by months_2011,delivr_month
 ) subq
order by delivr_month;

Data visualization of the monthly revenue changes

library(purrr)
library(lubridate)
library(forcats)

#plot

monthly_revenue %>%
    dplyr::mutate(month_id=lubridate::month(delivr_month))%>%
    split(.$month_id)%>% 
    accumulate(~bind_rows(.x, .y))%>%
    set_names(1:12) %>%
    bind_rows(.id = "frame")%>%
    plot_ly(x = ~month_id, 
            y = ~revenue,
            text=~months_2011,
            hoverinfo='text') %>%
    add_lines(frame = ~fct_reorder(frame,
                                   month_id))%>%
    animation_slider(currentvalue = list(
        prefix = "Month: ", 
        font = list(color = "black")))%>%
    animation_opts(easing = "linear")%>%
    layout(title = 'Monthly revenue for the year 2011',
           xaxis = list(title = "Months"),
           yaxis = list(title = "Avg_Revenue"))%>%
    hide_legend()

Maximum revenue fluctuated throughout the year but was highest during the period from October until December

0.2.5 Quarterly Revenue for the year 2011


SELECT 
to_char("InvoiceDate",'"Q"Q YYYY') as deliver_quarter,
  round(sum("UnitPrice" * "Quantity")::numeric,3) AS revenue
 FROM ecommerce_data
 where DATE_TRUNC('year',"InvoiceDate")::date  >= '2011-01-01'
 group by deliver_quarter
 having round(sum("UnitPrice" * "Quantity")::numeric,3) > 0
 order by deliver_quarter
 ;

4 records
deliver_quarter	revenue
Q1 2011	1741330
Q2 2011	1907664
Q3 2011	2383668
Q4 2011	2966129

Based on the monthly revenue, it was also seen that the fourth quarter generated the maximum revenue

0.2.6 New Customer registrations per month for the year 2011


WITH reg_dates AS (
  SELECT
    "CustomerID",
    MIN("InvoiceDate")::date AS reg_date
  FROM ecommerce_data
  where "InvoiceDate" >= '2011-01-01'
  GROUP BY "CustomerID"),
  regs AS (
  select
   DATE_TRUNC('month', reg_date) :: DATE as reg_month,
   COUNT(DISTINCT "CustomerID") AS regs
  FROM reg_dates
  GROUP BY reg_month)
SELECT
  to_char(reg_month, 'FMMonth') AS registration_month,
  regs as registrations
FROM regs
ORDER BY extract ('month' from reg_month) ASC;

The output of the above sql query is stored in a dataframe titled customer_reg

DT::datatable(customer_reg,
              class = 'cell-border stripe',
              rownames = FALSE,
              caption = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left;',
    htmltools::em('New montly Customer registrations per month for the year 2011')))

Data visualization of the above dataset

# To visualize the number of registrations, dataset is modified a little using tidyr package followed by a treemap visualization using treemap package

require(treemap)

customer_reg%>%
    tidyr::unite("Reg_combo",
                 registration_month:registrations,
                 sep=":",
                 remove=FALSE)%>%
    treemap(., 
        index=c("Reg_combo"),  
        vSize = "registrations",  
        type="index", 
        palette = "Set2",  
        title="New monthly customer registrations for 2011", 
        fontsize.title = 14)

Montly registrations saw an opposite trend with maximum new registrations occurring in the first quarter

0.2.7 Monthly Growth in the number of customers


 with mau AS (
  SELECT
    DATE_TRUNC('month', "InvoiceDate") :: DATE AS reg_month,
    COUNT(DISTINCT "CustomerID") AS current_customer_count
  from ecommerce_data
  WHERE "InvoiceDate" >= '2011-01-01'
  GROUP BY reg_month),
  mau_with_lag AS (
  SELECT
    reg_month,
    current_customer_count,
    COALESCE(
      LAG (current_customer_count) over (order by reg_month),
    1) AS previous_customer_count
  FROM mau)
SELECT
reg_month,
current_customer_count as customer_no,
    TO_CHAR(reg_month,'FMMonth') as month, 
   (current_customer_count -  previous_customer_count)::numeric/ previous_customer_count AS growth_rate
FROM mau_with_lag
ORDER BY reg_month;

Visualization of growth rate number of customers

#Editing the data for the visualization

growth_customers[1,4]<-0

# Shared data object for a joint visualization

growth_customers_shared<-growth_customers%>%
    dplyr::mutate(month_id=lubridate::month(reg_month))%>%
    split(.$month_id)%>% 
    accumulate(~bind_rows(.x, .y))%>%
    set_names(1:12) %>%
    bind_rows(.id = "frame")
  
growth_plot<-growth_customers_shared%>%
    plot_ly(x = ~month_id,
            y = ~growth_rate,
            text=~month,
            hoverinfo='text',
            color = "black") %>%
    add_lines( line = list(color = 'black', width = 4),
               frame = ~fct_reorder(frame,
                                           month_id))%>%
  animation_slider(currentvalue = list(
    prefix = "Month: ", 
    font = list(color = "black")))%>%
  animation_opts(easing = "linear")%>%
    layout(title = 'growth rate in customer registrations',
           xaxis = list(title = "Months"),
           yaxis = list(title = "Rate"))%>%
    hide_legend()

growth_plot

0.3 Unit Economics

0.3.1 Overall average revenue per customer


with avg_rev_customer as  (
  SELECT
    "CustomerID",
    SUM("UnitPrice" * "Quantity") AS revenue
  FROM ecommerce_data
  where "CustomerID" is not null and "InvoiceDate" >= '2011-01-01'
  GROUP BY "CustomerID")
SELECT ROUND(avg(revenue) :: numeric, 2) AS avg_revenue_per_customer
FROM avg_rev_customer;

1 records
avg_revenue_per_customer
1825.04

0.3.2 Average revenue per customer for each month in the year 2011


WITH avg_rev_cust_month AS (
  SELECT
    DATE_TRUNC('month', "InvoiceDate") :: DATE AS month,
    SUM("UnitPrice" * "Quantity") AS revenue,
    COUNT(DISTINCT "CustomerID") AS users
  from ecommerce_data
  where "InvoiceDate" >= '2011-01-01' and "CustomerID" is not null
  group by month)
SELECT
month,
 to_char(month, 'FMMonth') as month_2011,
 ROUND(
    revenue :: NUMERIC / GREATEST(users, 1),
  2) AS avg_revenue_customer
FROM avg_rev_cust_month
ORDER BY month ASC;

Data Visualization of the ARPU

monthly_arpu%>%
  mutate(month_id=month(month))%>%
  plot_ly(x = ~forcats::fct_reorder(month_2011,
                                    month_id),
          y=~avg_revenue_customer,
          type = "bar",
          color = 'brown',
          marker = list(color = 'orange',
                        line = list(color = 'black',
                                    width = 1.5)))%>%
  layout(title = 'Avgerage revenue per customer for 2011',
         xaxis = list(title = "Months"),
         yaxis = list(title = "Avg revenue"))

Average revenue generated per customer was higher in the 3rd and 4th quarters with maximum value obtained in September

0.3.3 Number of customers with their corresponding number of orders


with customer_orders as (
 SELECT
    "CustomerID",
    COUNT(DISTINCT "InvoiceNo") AS number_of_orders
  FROM ecommerce_data
  where "InvoiceDate" >= '2011-01-01' and "CustomerID" is not null
  GROUP BY "CustomerID"
  order by "CustomerID"
  )
  select
  number_of_orders,
  count(distinct "CustomerID") as customer_number
  from
 customer_orders
  group by number_of_orders
  order by number_of_orders
   ;

Data Visualization of customer counts with corresponding number of orders

customer_order_nos%>%
  plot_ly(x = ~number_of_orders,
          y=~customer_number,
        type = "bar",
        color=I("grey50"),
        hoverinfo = 'text',
        text=~paste("Number of orders:",number_of_orders, "<br>",
                    "Customer counts:",customer_number))%>%
  layout(title = 'Customer counts corresponding to number of orders',
         xaxis = list(title = "Number of orders"),
         yaxis = list(title = "Customer counts"))

Most of the customers ordered between 1 to 20 items with only 1-2 customers having ordered more than 100 different items.

0.3.4 Observing Customer trends based on revenue (by bucketing)


-- For generating buckets, revenue distribution trends are first observed by using some aggregate measures

    --a1. Generating a temporary table 

drop table if exists customer_revenues;

create temp table customer_revenues as (
  SELECT
   "CustomerID" as customer_id,
     SUM("UnitPrice" * "Quantity") AS revenue
  FROM ecommerce_data
  where "InvoiceDate" >= '2011-01-01' and "CustomerID" is not null
  GROUP BY "CustomerID");

  
  --a2. Obtaining some aggregate measures for generating buckets

  select
  round(min(revenue)::numeric,2) as min_revenue,
  round(percentile_cont(0.50) within group (order by revenue):: numeric,2) as median_revenue,
  round(avg(revenue)::numeric,2) as avg_revenue,
  round(max(revenue)::numeric,2) as max_revenue
  from
  customer_revenues;  

  --a3. Three buckets are created based on the aggregates obtained above 

SELECT
  CASE
    WHEN revenue <= 672 THEN 'Low-revenue users'
    WHEN revenue >= 672.1 and revenue <= 2110 THEN 'Mid-revenue users'
    when revenue >= 2110.1 then 'High-revenue users'
    END AS revenue_group,
  count (distinct customer_id) as customers
FROM customer_revenues
GROUP BY revenue_group
order by customers desc;

3 records
revenue_group	customers
Low-revenue users	2193
Mid-revenue users	1283
High-revenue users	768

0.3.5 Observing Customer trends based on order (numbers) (by bucketing)


--a1. Creating a temporary table 

drop table if exists customer_orders;

create temp table customer_orders as (
 SELECT
    "CustomerID" as customer_id,
    COUNT(DISTINCT "InvoiceNo") AS orders_count
  FROM ecommerce_data
  where "InvoiceDate" >= '2011-01-01' and "CustomerID" is not null
  GROUP BY "CustomerID"
  order by "CustomerID"
  );
  
 --a2. Obtaining some aggregate measures for generating buckets

  select
  round(min(orders_count)::numeric,2) as min_count,
  round(percentile_cont(0.50) within group (order by orders_count):: numeric,2) as median_count,
  round(percentile_cont(0.75) within group (order by orders_count):: numeric,2) as seventyfive_percentile_count,
  round(avg(orders_count)::numeric,2) as avg_count,
  round(max(orders_count)::numeric,2) as max_count
  from
  customer_orders;   
  
 
   --a3. Three buckets are created based on the aggregates obtained above 
  
SELECT
  CASE
    WHEN orders_count <= 3 THEN 'Low-order users'
    WHEN orders_count > 3 and orders_count <= 10   THEN 'Mid-order users'
    ELSE 'High-order users'
  END AS user_categories,
  count (distinct customer_id) AS customer_number
FROM customer_orders
GROUP BY user_categories
order by customer_number desc;

3 records
user_categories	customer_number
Low-order users	2596
Mid-order users	1242
High-order users	406

The number of customers decreased based on the order numbers as seen from the histogram as well

This work is still in progress and will be updated from time to time