Rationale
The main aim of the project was to perform exploratory data analysis using an eCommerce dataset for the year 2011
Key Performance Indicators (KPI’s) based on Revenue and Customer numbers along with Unit economics like Average Revenue per Customer (ARPU) were calculated in order to observe trends if/any in the dataset
Data was cleaned and edited done using DBeaver IDE Scriptand further editing was carried out in RStudio
Number of rows in the dataset
| number_entries |
|---|
| 541909 |
Exploring and summarizing the revenue generated (KPI) for the year 2011 based on item type and countries Creating a temporary table for generating the revenue summary based on different fields
create temp table revenue_item_type as (
SELECT
rank () over (order by sum("UnitPrice" * "Quantity")::numeric desc) as item_rank,
initcap("Description") as Item_description,
"Country" as Country,
round(sum("UnitPrice" * "Quantity")::numeric,3) AS revenue
FROM ecommerce_data
where "CustomerID" is not null and "InvoiceDate" >= '2011-01-01'
group by "Description","Country"
order by revenue desc
);The output of the above sql query is stored in a dataframe titled max_revenue
DT::datatable(max_revenue,
class = 'cell-border stripe',
rownames = FALSE,
caption = htmltools::tags$caption(
style = 'caption-side: top; text-align: left;',
htmltools::em('Top 10 items generating the maximum revenue')))Visualizing the maximum revenue by items
It could be seen that the the three tiered Regency Cakestand was the item whcih generated the maximum revenue
SELECT
Country,
revenue
from
(select
rank() over (order by sum(revenue) desc) as country_rank,
Country,
sum(revenue) as revenue
FROM revenue_item_type
group by Country
order by sum(revenue) desc) subq
limit 5
;The output of the above sql query is stored in a dataframe titled max_revenue_country
#Changing the name of EIRE to Ireland
max_revenue_country[3,]<-c("Ireland",
max_revenue_country[3,"revenue"])
# GIS data of the countries is first added to the dataset using rworldmap and rgeos(centroid GIS data) packages for mapping the countries
if(!require(rworldmap))install.packages('rworldmap')
if(!require(rgeos))install.packages('rgeos')
# Obtaining the world map centroids
world_centroids <- rworldmap::getMap(resolution="high")%>%
rgeos::gCentroid(.,byid=TRUE)%>%
data.frame(.)%>%
rownames_to_column(.)%>%
dplyr::rename(name=rowname)
# Combining the centroid dataset with the revenue data
country_data_gis<- max_revenue_country%>%
inner_join(world_centroids,
by = c('country' = 'name'))%>%
dplyr::rename(lon=x,
lat=y)
# For interactivity, the data are converted into a shared data object using the crosstalk package to help in data visualization interactivity
require(crosstalk)
shared_country_gis <- SharedData$new(country_data_gis,
key = ~country)
# Plot 1: Barchart of the top 5 countries by revenue
barchart_country<-shared_country_gis%>%
plot_ly(x = ~forcats::fct_reorder(country,revenue),
y=~sqrt(as.numeric(revenue)),
type = "bar")%>%
layout(title = 'Top 5 countries by revenue',
xaxis = list(title = "Countries"),
yaxis = list(title = "Revenue"))
#Plot 2: Map to visualize the countries using leaflet package
if(!require(leaflet))install.packages('leaflet')
#Color scheme for the GIS points
colour_combo <- colorFactor(c("orange", "green", "black"),
domain = unique(shared_country_gis$country))
# Map
locality_map <- leaflet(shared_country_gis) %>%
addProviderTiles("Stamen.Terrain") %>%
addCircleMarkers(
color = ~colour_combo(country),
stroke = TRUE,
fillOpacity = 1,
radius=5,
lng = ~lon,
lat = ~lat,
label = ~as.character(country),
popup = paste0("<strong>Country: </strong>", country_data_gis$country, "</br>",
"<strong>revenue: </strong>", country_data_gis$revenue))%>%
addTiles()
# Combining the two plots to have interactivity (using crosstalk package)
crosstalk::bscols(widths=c(5,7),
barchart_country,
locality_map)
SELECT
months_2011,
delivr_month,
revenue
from(
select
to_char("InvoiceDate",'FMMonth') as months_2011,
DATE_TRUNC('month', "InvoiceDate") :: DATE as delivr_month,
round(sum("UnitPrice" * "Quantity")::numeric,3) AS revenue
FROM ecommerce_data
where "InvoiceDate" >= '2011-01-01'
group by months_2011,delivr_month
) subq
order by delivr_month;Data visualization of the monthly revenue changes
library(purrr)
library(lubridate)
library(forcats)
#plot
monthly_revenue %>%
dplyr::mutate(month_id=lubridate::month(delivr_month))%>%
split(.$month_id)%>%
accumulate(~bind_rows(.x, .y))%>%
set_names(1:12) %>%
bind_rows(.id = "frame")%>%
plot_ly(x = ~month_id,
y = ~revenue,
text=~months_2011,
hoverinfo='text') %>%
add_lines(frame = ~fct_reorder(frame,
month_id))%>%
animation_slider(currentvalue = list(
prefix = "Month: ",
font = list(color = "black")))%>%
animation_opts(easing = "linear")%>%
layout(title = 'Monthly revenue for the year 2011',
xaxis = list(title = "Months"),
yaxis = list(title = "Avg_Revenue"))%>%
hide_legend() Maximum revenue fluctuated throughout the year but was highest during the period from October until December
SELECT
to_char("InvoiceDate",'"Q"Q YYYY') as deliver_quarter,
round(sum("UnitPrice" * "Quantity")::numeric,3) AS revenue
FROM ecommerce_data
where DATE_TRUNC('year',"InvoiceDate")::date >= '2011-01-01'
group by deliver_quarter
having round(sum("UnitPrice" * "Quantity")::numeric,3) > 0
order by deliver_quarter
;| deliver_quarter | revenue |
|---|---|
| Q1 2011 | 1741330 |
| Q2 2011 | 1907664 |
| Q3 2011 | 2383668 |
| Q4 2011 | 2966129 |
Based on the monthly revenue, it was also seen that the fourth quarter generated the maximum revenue
WITH reg_dates AS (
SELECT
"CustomerID",
MIN("InvoiceDate")::date AS reg_date
FROM ecommerce_data
where "InvoiceDate" >= '2011-01-01'
GROUP BY "CustomerID"),
regs AS (
select
DATE_TRUNC('month', reg_date) :: DATE as reg_month,
COUNT(DISTINCT "CustomerID") AS regs
FROM reg_dates
GROUP BY reg_month)
SELECT
to_char(reg_month, 'FMMonth') AS registration_month,
regs as registrations
FROM regs
ORDER BY extract ('month' from reg_month) ASC; The output of the above sql query is stored in a dataframe titled customer_reg
DT::datatable(customer_reg,
class = 'cell-border stripe',
rownames = FALSE,
caption = htmltools::tags$caption(
style = 'caption-side: top; text-align: left;',
htmltools::em('New montly Customer registrations per month for the year 2011')))Data visualization of the above dataset
# To visualize the number of registrations, dataset is modified a little using tidyr package followed by a treemap visualization using treemap package
require(treemap)
customer_reg%>%
tidyr::unite("Reg_combo",
registration_month:registrations,
sep=":",
remove=FALSE)%>%
treemap(.,
index=c("Reg_combo"),
vSize = "registrations",
type="index",
palette = "Set2",
title="New monthly customer registrations for 2011",
fontsize.title = 14)Montly registrations saw an opposite trend with maximum new registrations occurring in the first quarter
with mau AS (
SELECT
DATE_TRUNC('month', "InvoiceDate") :: DATE AS reg_month,
COUNT(DISTINCT "CustomerID") AS current_customer_count
from ecommerce_data
WHERE "InvoiceDate" >= '2011-01-01'
GROUP BY reg_month),
mau_with_lag AS (
SELECT
reg_month,
current_customer_count,
COALESCE(
LAG (current_customer_count) over (order by reg_month),
1) AS previous_customer_count
FROM mau)
SELECT
reg_month,
current_customer_count as customer_no,
TO_CHAR(reg_month,'FMMonth') as month,
(current_customer_count - previous_customer_count)::numeric/ previous_customer_count AS growth_rate
FROM mau_with_lag
ORDER BY reg_month;Visualization of growth rate number of customers
#Editing the data for the visualization
growth_customers[1,4]<-0
# Shared data object for a joint visualization
growth_customers_shared<-growth_customers%>%
dplyr::mutate(month_id=lubridate::month(reg_month))%>%
split(.$month_id)%>%
accumulate(~bind_rows(.x, .y))%>%
set_names(1:12) %>%
bind_rows(.id = "frame")
growth_plot<-growth_customers_shared%>%
plot_ly(x = ~month_id,
y = ~growth_rate,
text=~month,
hoverinfo='text',
color = "black") %>%
add_lines( line = list(color = 'black', width = 4),
frame = ~fct_reorder(frame,
month_id))%>%
animation_slider(currentvalue = list(
prefix = "Month: ",
font = list(color = "black")))%>%
animation_opts(easing = "linear")%>%
layout(title = 'growth rate in customer registrations',
xaxis = list(title = "Months"),
yaxis = list(title = "Rate"))%>%
hide_legend()
growth_plot
with avg_rev_customer as (
SELECT
"CustomerID",
SUM("UnitPrice" * "Quantity") AS revenue
FROM ecommerce_data
where "CustomerID" is not null and "InvoiceDate" >= '2011-01-01'
GROUP BY "CustomerID")
SELECT ROUND(avg(revenue) :: numeric, 2) AS avg_revenue_per_customer
FROM avg_rev_customer;| avg_revenue_per_customer |
|---|
| 1825.04 |
WITH avg_rev_cust_month AS (
SELECT
DATE_TRUNC('month', "InvoiceDate") :: DATE AS month,
SUM("UnitPrice" * "Quantity") AS revenue,
COUNT(DISTINCT "CustomerID") AS users
from ecommerce_data
where "InvoiceDate" >= '2011-01-01' and "CustomerID" is not null
group by month)
SELECT
month,
to_char(month, 'FMMonth') as month_2011,
ROUND(
revenue :: NUMERIC / GREATEST(users, 1),
2) AS avg_revenue_customer
FROM avg_rev_cust_month
ORDER BY month ASC;Data Visualization of the ARPU
monthly_arpu%>%
mutate(month_id=month(month))%>%
plot_ly(x = ~forcats::fct_reorder(month_2011,
month_id),
y=~avg_revenue_customer,
type = "bar",
color = 'brown',
marker = list(color = 'orange',
line = list(color = 'black',
width = 1.5)))%>%
layout(title = 'Avgerage revenue per customer for 2011',
xaxis = list(title = "Months"),
yaxis = list(title = "Avg revenue"))Average revenue generated per customer was higher in the 3rd and 4th quarters with maximum value obtained in September
with customer_orders as (
SELECT
"CustomerID",
COUNT(DISTINCT "InvoiceNo") AS number_of_orders
FROM ecommerce_data
where "InvoiceDate" >= '2011-01-01' and "CustomerID" is not null
GROUP BY "CustomerID"
order by "CustomerID"
)
select
number_of_orders,
count(distinct "CustomerID") as customer_number
from
customer_orders
group by number_of_orders
order by number_of_orders
;
Data Visualization of customer counts with corresponding number of orders
customer_order_nos%>%
plot_ly(x = ~number_of_orders,
y=~customer_number,
type = "bar",
color=I("grey50"),
hoverinfo = 'text',
text=~paste("Number of orders:",number_of_orders, "<br>",
"Customer counts:",customer_number))%>%
layout(title = 'Customer counts corresponding to number of orders',
xaxis = list(title = "Number of orders"),
yaxis = list(title = "Customer counts"))Most of the customers ordered between 1 to 20 items with only 1-2 customers having ordered more than 100 different items.
-- For generating buckets, revenue distribution trends are first observed by using some aggregate measures
--a1. Generating a temporary table
drop table if exists customer_revenues;
create temp table customer_revenues as (
SELECT
"CustomerID" as customer_id,
SUM("UnitPrice" * "Quantity") AS revenue
FROM ecommerce_data
where "InvoiceDate" >= '2011-01-01' and "CustomerID" is not null
GROUP BY "CustomerID");
--a2. Obtaining some aggregate measures for generating buckets
select
round(min(revenue)::numeric,2) as min_revenue,
round(percentile_cont(0.50) within group (order by revenue):: numeric,2) as median_revenue,
round(avg(revenue)::numeric,2) as avg_revenue,
round(max(revenue)::numeric,2) as max_revenue
from
customer_revenues;
--a3. Three buckets are created based on the aggregates obtained above
SELECT
CASE
WHEN revenue <= 672 THEN 'Low-revenue users'
WHEN revenue >= 672.1 and revenue <= 2110 THEN 'Mid-revenue users'
when revenue >= 2110.1 then 'High-revenue users'
END AS revenue_group,
count (distinct customer_id) as customers
FROM customer_revenues
GROUP BY revenue_group
order by customers desc;| revenue_group | customers |
|---|---|
| Low-revenue users | 2193 |
| Mid-revenue users | 1283 |
| High-revenue users | 768 |
--a1. Creating a temporary table
drop table if exists customer_orders;
create temp table customer_orders as (
SELECT
"CustomerID" as customer_id,
COUNT(DISTINCT "InvoiceNo") AS orders_count
FROM ecommerce_data
where "InvoiceDate" >= '2011-01-01' and "CustomerID" is not null
GROUP BY "CustomerID"
order by "CustomerID"
);
--a2. Obtaining some aggregate measures for generating buckets
select
round(min(orders_count)::numeric,2) as min_count,
round(percentile_cont(0.50) within group (order by orders_count):: numeric,2) as median_count,
round(percentile_cont(0.75) within group (order by orders_count):: numeric,2) as seventyfive_percentile_count,
round(avg(orders_count)::numeric,2) as avg_count,
round(max(orders_count)::numeric,2) as max_count
from
customer_orders;
--a3. Three buckets are created based on the aggregates obtained above
SELECT
CASE
WHEN orders_count <= 3 THEN 'Low-order users'
WHEN orders_count > 3 and orders_count <= 10 THEN 'Mid-order users'
ELSE 'High-order users'
END AS user_categories,
count (distinct customer_id) AS customer_number
FROM customer_orders
GROUP BY user_categories
order by customer_number desc;| user_categories | customer_number |
|---|---|
| Low-order users | 2596 |
| Mid-order users | 1242 |
| High-order users | 406 |
The number of customers decreased based on the order numbers as seen from the histogram as well
This work is still in progress and will be updated from time to time