The goal of this task is to conduct an Exploratory Data Analysis (EDA) on data provided by Idealo.
The data is stored on a AWS in the following base tables.
Brands
products
categories
shops
product_conversions_by_shop_by_day
brand_traffic_by_week
product_traffic_by_day
Theese libraries are used throught the code. Libraries specific to the line of code are defined with the code.
library(ggplot2) # Used to Create Visualization
library(dplyr) # Used to Manipulate and Clean Data
library(reshape2) # Used to Manipulate Data for Plotting
library(DBI) # Used to query DB
Creating a Connection to the Database to Enable Postgres Queries.
db <- 'db_30444' #provide the name of your db
host_db <- 'candidates-test-do-not-delete.cj0lhhdrhdo9.eu-central-1.rds.amazonaws.com'
db_port <- '5432' # or any other port specified by the DBA
db_user <- 'roshan_kumar_bhuyan'
db_password <- 'AEM~Jh]?!L}t2gh('
con <- dbConnect(RPostgres::Postgres(), dbname = db, host=host_db, port=db_port, user=db_user, password=db_password)
Getting an initial initution of the Data Stored within the various tables.
Brands
SELECT * FROM customer.brands LIMIT 100
| id | name | own_brand |
|---|---|---|
| 7382 | Brand 7382 | FALSE |
| 5704 | Brand 5704 | FALSE |
| 5274 | Brand 5274 | TRUE |
| 5338 | Brand 5338 | FALSE |
| 5670 | Brand 5670 | FALSE |
Brands <- dbGetQuery(con,"SELECT * FROM customer.brands LIMIT 100")
str(Brands)
## 'data.frame': 11 obs. of 3 variables:
## $ id :integer64 7382 5704 5274 5338 5670 5258 5587 5291 ...
## $ name : chr "Brand 7382" "Brand 5704" "Brand 5274" "Brand 5338" ...
## $ own_brand: logi FALSE FALSE TRUE FALSE FALSE FALSE ...
Products
SELECT * FROM customer.products LIMIT 100
| id | main_product_id | brand_id | category_id | name | startprice |
|---|---|---|---|---|---|
| 1154650 | NA | 5258 | 6183 | Product 1154650 | 0 |
| 1472291 | NA | 5622 | 6183 | Product 1472291 | 0 |
| 1904246 | NA | 5371 | 6183 | Product 1904246 | 22357 |
| 1977346 | NA | 5275 | 6183 | Product 1977346 | 49990 |
| 2143085 | NA | 6562 | 6183 | Product 2143085 | 70000 |
products <- dbGetQuery(con, "SELECT * FROM customer.products LIMIT 100")
str(products)
## 'data.frame': 100 obs. of 6 variables:
## $ id :integer64 1154650 1472291 1904246 1977346 2143085 2157930 2181201 2181787 ...
## $ main_product_id:integer64 NA NA NA NA NA NA NA NA ...
## $ brand_id :integer64 5258 5622 5371 5275 6562 5371 7342 5704 ...
## $ category_id :integer64 6183 6183 6183 6183 6183 6183 6183 6183 ...
## $ name : chr "Product 1154650" "Product 1472291" "Product 1904246" "Product 1977346" ...
## $ startprice :integer64 0 0 22357 49990 70000 61599 37900 41900 ...
brand_traffic_by_week
SELECT * FROM customer.brand_traffic_by_week LIMIT 100
| week_start | website | device_type | brand_id | views | unique_views | unique_visitors |
|---|---|---|---|---|---|---|
| 2018-12-31 | AT | desktop | 5248 | 79 | 50 | 38 |
| 2018-12-31 | AT | desktop | 5258 | 1880 | 959 | 702 |
| 2018-12-31 | AT | desktop | 5260 | 747 | 406 | 305 |
| 2018-12-31 | AT | desktop | 5261 | 636 | 412 | 301 |
| 2018-12-31 | AT | desktop | 5274 | 7657 | 3745 | 2874 |
brand_traffic_by_week <- dbGetQuery(con, "SELECT * FROM customer.brand_traffic_by_week LIMIT 100")
str(brand_traffic_by_week)
## 'data.frame': 100 obs. of 7 variables:
## $ week_start : Date, format: "2018-12-31" "2018-12-31" ...
## $ website : chr "AT" "AT" "AT" "AT" ...
## $ device_type : chr "desktop" "desktop" "desktop" "desktop" ...
## $ brand_id :integer64 5248 5258 5260 5261 5274 5275 5291 5327 ...
## $ views :integer64 79 1880 747 636 7657 1 7244 128 ...
## $ unique_views :integer64 50 959 406 412 3745 1 4691 89 ...
## $ unique_visitors:integer64 38 702 305 301 2874 1 3521 70 ...
categories
SELECT * FROM customer.categories LIMIT 100
| id | name |
|---|---|
| 6183 | Category 1 |
| 6402 | Category 2 |
| 8201 | Category 3 |
categories <- dbGetQuery(con, "SELECT * FROM customer.categories LIMIT 100")
str(categories)
## 'data.frame': 3 obs. of 2 variables:
## $ id :integer64 6183 6402 8201
## $ name: chr "Category 1" "Category 2" "Category 3"
product_conversions_by_shop_by_day
SELECT * FROM customer.product_conversions_by_shop_by_day LIMIT 100
| date | website | product_id | device_type | shop_id | leadouts | checkouts | avg_leadout_price | avg_checkout_price |
|---|---|---|---|---|---|---|---|---|
| 2019-01-01 | AT | 4774094 | desktop | 14523 | 1 | 0 | 1290 | NA |
| 2019-01-01 | AT | 4852800 | desktop | 14523 | 1 | 0 | 478 | NA |
| 2019-01-01 | AT | 4854521 | desktop | 14523 | 1 | 0 | 534 | NA |
| 2019-01-01 | AT | 5755724 | desktop | 14523 | 1 | 0 | 533 | NA |
| 2019-01-01 | AT | 4709528 | mobile | 14523 | 1 | 0 | 262 | NA |
product_conversions_by_shop_by_day <- dbGetQuery(con, "SELECT * FROM customer.product_conversions_by_shop_by_day LIMIT 100")
str(product_conversions_by_shop_by_day)
## 'data.frame': 100 obs. of 9 variables:
## $ date : Date, format: "2019-01-01" "2019-01-01" ...
## $ website : chr "AT" "AT" "AT" "AT" ...
## $ product_id :integer64 4774094 4852800 4854521 5755724 4709528 5125099 5165136 5511073 ...
## $ device_type : chr "desktop" "desktop" "desktop" "desktop" ...
## $ shop_id :integer64 14523 14523 14523 14523 14523 14523 24759 24759 ...
## $ leadouts :integer64 1 1 1 1 1 1 1 1 ...
## $ checkouts :integer64 0 0 0 0 0 0 0 0 ...
## $ avg_leadout_price : num 1290 478 534 533 262 446 335 343 474 714 ...
## $ avg_checkout_price: num NA NA NA NA NA NA NA NA NA NA ...
shops
SELECT * FROM customer.shops
| id | country | name |
|---|---|---|
| 5382 | DE | Shop 223 |
| 5604 | DE | Shop 17 |
| 6622 | DE | Shop 319 |
| 6642 | DE | Shop 143 |
| 7282 | DE | Shop 189 |
shops <- dbGetQuery(con, "SELECT * FROM customer.shops")
str(shops)
## 'data.frame': 644 obs. of 3 variables:
## $ id :integer64 5382 5604 6622 6642 7282 7742 8882 9262 ...
## $ country: chr "DE" "DE" "DE" "DE" ...
## $ name : chr "Shop 223" "Shop 17" "Shop 319" "Shop 143" ...
product_traffic_by_day
SELECT * FROM customer.product_traffic_by_day LIMIT 100
| date | website | product_id | device_type | views | unique_views | unique_visitors |
|---|---|---|---|---|---|---|
| 2019-01-24 | ES | 4453281 | desktop | 12 | 8 | 8 |
| 2019-01-24 | AT | 6190750 | desktop | 1 | 1 | 1 |
| 2019-01-24 | AT | 5109796 | desktop | 1 | 1 | 1 |
| 2019-01-24 | FR | 6340006 | mobile | 2 | 2 | 2 |
| 2019-01-24 | UK | 5763200 | desktop | 1 | 1 | 1 |
product_traffic_by_day <- dbGetQuery(con, "SELECT * FROM customer.product_traffic_by_day LIMIT 100")
str(product_traffic_by_day)
## 'data.frame': 100 obs. of 7 variables:
## $ date : Date, format: "2019-01-24" "2019-01-24" ...
## $ website : chr "ES" "AT" "AT" "FR" ...
## $ product_id :integer64 4453281 6190750 5109796 6340006 5763200 6361620 5109796 5530574 ...
## $ device_type : chr "desktop" "desktop" "desktop" "mobile" ...
## $ views :integer64 12 1 1 2 1 2 2 1 ...
## $ unique_views :integer64 8 1 1 2 1 2 2 1 ...
## $ unique_visitors:integer64 8 1 1 2 1 2 2 1 ...
The “brands” table lists the customer’s own brand together with its top competitors. The customer’s own brand is flagged as “own_brand = true”
The “products” table lists all products in the categories where the customer is active, including those from brands that are not listed in the “brands” table
All products in the “products” table that have a “main_product_id” are to be considered a variant of such main product - for example a specific color or size. Metrics of such products should therefore be grouped by “main_product_id” when working on product reports
Conversion metrics are defined as follows:
A “leadout” is a click on idealo that leads to a partner shop
A “checkout” is a click on idealo that directly leads to a checkout on idealo itself
For each week in the data set, show the total views received by products of the customer’s brand together with the average total views received by products of other brands listed in the “brands” table (within the same week).
The total views are obtained by Aggegration and average views for other brands come form the brand_traffic_by_week table and are joined by brand ID
SELECT * FROM (SELECT DISTINCT sda.brand_id,
sda.Week,
--sda.own_brand,
Sum(sda.views) OVER (PARTITION BY sda.brand_id,
sda.Week) AS Total_views_Per_Brand,
BFT.views AS competitor_views
FROM
(SELECT *,
to_char(PTBD.date, 'IW-IYYY') AS WEEK
FROM
(SELECT *
FROM customer.products) Pro
LEFT JOIN
(SELECT *
FROM customer.Brands) Bra ON Pro.brand_id = Bra.id
LEFT JOIN
(SELECT *
FROM customer.product_traffic_by_day) PTBD ON PTBD.product_id = Pro.id) sda
LEFT JOIN
(SELECT avg(views) as views,brand_id,
to_char(week_start, 'IW-IYYY') AS WEEK
FROM customer.brand_traffic_by_week
GROUP BY brand_id,WEEK Order by brand_id) BFT ON Sda.week = BFT.week
AND Sda.brand_id = BFT.brand_id
WHERE sda.week IS NOT NULL ORDER BY sda.brand_id)A
| brand_id | week | total_views_per_brand | competitor_views |
|---|---|---|---|
| 5248 | 01-2019 | 53 | 1322.182 |
| 5248 | 02-2019 | 43 | 1207.167 |
| 5248 | 03-2019 | 41 | 1131.500 |
| 5248 | 04-2019 | 41 | 1160.083 |
| 5248 | 05-2019 | 683 | 1226.000 |
This displays Data only for the top 5 brands mentioned in the brands table. Commenting “WHERE brand_id in (SELECT id FROM customer.Brands LIMIT 5)” would provide all results
Total_views_per_brand_1 <- dbGetQuery(con,
"SELECT * FROM (SELECT DISTINCT sda.brand_id,
sda.Week,
--sda.own_brand,
Sum(sda.views) OVER (PARTITION BY sda.brand_id,
sda.Week) AS Total_views_Per_Brand,
BFT.views AS competitor_views
FROM
(SELECT *,
to_char(PTBD.date, 'IW-IYYY') AS WEEK
FROM
(SELECT *
FROM customer.products) Pro
LEFT JOIN
(SELECT *
FROM customer.Brands) Bra ON Pro.brand_id = Bra.id
LEFT JOIN
(SELECT *
FROM customer.product_traffic_by_day) PTBD ON PTBD.product_id = Pro.id) sda
LEFT JOIN
(SELECT avg(views) as views,brand_id,
to_char(week_start, 'IW-IYYY') AS WEEK
FROM customer.brand_traffic_by_week
GROUP BY brand_id,WEEK Order by brand_id) BFT ON Sda.week = BFT.week
AND Sda.brand_id = BFT.brand_id
WHERE sda.week IS NOT NULL ORDER BY sda.brand_id)A
WHERE brand_id in (SELECT id FROM customer.Brands LIMIT 5)
")
library(rmarkdown)
paged_table(Total_views_per_brand_1)
Visualization of the results
library(reshape2)
Total_views_per_brand_1$brand_id <- as.character(Total_views_per_brand_1$brand_id)
mdata3 <- melt(Total_views_per_brand_1, id=c("week","brand_id"))
library(directlabels) # Library for adding names to lines on plots
ggplot(mdata3 , aes(x = week, y = value, col = variable,group = 1)) + geom_line() + ylim(10, 5000)+
geom_dl(aes(label = variable) , method = list(dl.combine("last.points"), cex = 0.7)) +
facet_grid(brand_id ~ .) + ggtitle("Sample Brands vs Competitors Vizualization") +
theme(legend.position = "none")
For each week in the data set and only for mobile traffic in DE, show the unique visitors of the customer’s brand and the average unique visitors of other brands that are listed in the “brands” table (within the same week)
SELECT DISTINCT sda.brand_id,
sda.week,
sda.own_brand,
Sum(sda.unique_visitors)
OVER (
partition BY sda.brand_id, sda.week, sda.website, sda.device_type) AS
Total_weekly_unique_visits_Per_Brand,
BFT.unique_views
AS competitor_views
FROM (SELECT *,
To_char(PTBD.date, 'IW-IYYY') AS Week
FROM (SELECT *
FROM customer.products) Pro
LEFT JOIN (SELECT *
FROM customer.brands) Bra
ON Pro.brand_id = Bra.id
LEFT JOIN (SELECT *
FROM customer.product_traffic_by_day) PTBD
ON PTBD.product_id = Pro.id)sda
LEFT JOIN (SELECT avg(unique_visitors) as unique_views,brand_id,
To_char(week_start, 'IW-IYYY') AS week
FROM customer.brand_traffic_by_week
GROUP BY brand_id,WEEK Order by brand_id
)BFT
ON Sda.week = BFT.week
AND Sda.brand_id = BFT.brand_id
WHERE Sda.week IS NOT NULL
AND Upper(Sda.website) LIKE 'DE'
AND Upper(Sda.device_type) LIKE '%MOBILE%'
ORDER BY week ASC
| brand_id | week | own_brand | total_weekly_unique_visits_per_brand | competitor_views |
|---|---|---|---|---|
| 5248 | 05-2019 | NA | 179 | 545.0833 |
| 5258 | 05-2019 | FALSE | 4317 | 6271.9167 |
| 5274 | 05-2019 | TRUE | 4218 | 11192.2500 |
| 5275 | 05-2019 | FALSE | 1358 | 2.5000 |
| 5291 | 05-2019 | FALSE | 1095 | 24955.8333 |
This displays Data only for the top 5 brands mentioned in the brands table. Commenting “AND Sda.brand_id in (SELECT id FROM customer.Brands )” would provide all results
unique_visitors_of_the_customer_brand_2 <- dbGetQuery(con,
"SELECT DISTINCT sda.brand_id,
sda.week,
sda.own_brand,
Sum(sda.unique_visitors)
OVER (
partition BY sda.brand_id, sda.week, sda.website, sda.device_type) AS
Total_weekly_unique_visits_Per_Brand,
BFT.unique_views
AS competitor_views
FROM (SELECT *,
To_char(PTBD.date, 'IW-IYYY') AS Week
FROM (SELECT *
FROM customer.products) Pro
LEFT JOIN (SELECT *
FROM customer.brands) Bra
ON Pro.brand_id = Bra.id
LEFT JOIN (SELECT *
FROM customer.product_traffic_by_day) PTBD
ON PTBD.product_id = Pro.id)sda
LEFT JOIN (SELECT avg(unique_visitors) as unique_views,brand_id,
To_char(week_start, 'IW-IYYY') AS week
FROM customer.brand_traffic_by_week
GROUP BY brand_id,WEEK Order by brand_id
)BFT
ON Sda.week = BFT.week
AND Sda.brand_id = BFT.brand_id
WHERE Sda.week IS NOT NULL
AND Upper(Sda.website) LIKE 'DE'
AND Upper(Sda.device_type) LIKE '%MOBILE%'
AND Sda.brand_id in (SELECT id FROM customer.Brands )
ORDER BY week ASC ")
paged_table(unique_visitors_of_the_customer_brand_2)
Visualization of the results
library(reshape2)
unique_visitors_of_the_customer_brand_2$brand_id <- as.character(unique_visitors_of_the_customer_brand_2$brand_id)
mdata3 <- melt(unique_visitors_of_the_customer_brand_2, id=c("week","brand_id"))
library(directlabels) # Library for adding names to lines on plots
ggplot(mdata3 , aes(x = week, y = value, col = variable,group = 1)) + geom_line() + ylim(0, 5000) +
geom_dl(aes(label = variable) , method = list(dl.combine("last.points"), cex = 0.7)) +
facet_grid(brand_id ~ .) + ggtitle("Sample Brands vs Competitors Vizualization") +
theme(legend.position = "none")
Show the top 20 shops by leadouts for the customer’s brand and their average leadout price
top_20_shops_by_leadouts_4 <- dbGetQuery(con,"SELECT DISTINCT brand_id,
shop_id,
Shop_name as shop_name ,
SUM(leadouts) OVER (PARTITION BY brand_id,
shop_id) AS Total_leadout,
ROUND(avg(avg_leadout_price) OVER (PARTITION BY brand_id, shop_id),2) AS avg_leadout_price
FROM
(SELECT PCSD.*,
sho.country,
sho.name AS Shop_name,
Pro.*
FROM
(SELECT *
FROM customer.product_conversions_by_shop_by_day) PCSD
LEFT JOIN
(SELECT *
FROM customer.shops) Sho ON PCSD.shop_id = sho.id
LEFT JOIN
(SELECT *
FROM customer.products) Pro ON Pro.id = PCSD.product_id)A
ORDER BY Total_leadout DESC
LIMIT 20")
paged_table(top_20_shops_by_leadouts_4)
Visualization of the results
top_20_shops_by_leadouts_4 <- select(top_20_shops_by_leadouts_4, shop_name ,brand_id,avg_leadout_price,total_leadout)
top_20_shops_by_leadouts_4$brand_id <- as.character(top_20_shops_by_leadouts_4$brand_id)
top_20_shops_by_leadouts_4$shop_name <- as.character(top_20_shops_by_leadouts_4$shop_name)
mdata3 <-top_20_shops_by_leadouts_4
ggplot(data = mdata3, aes(x = shop_name, y = avg_leadout_price )) +
geom_bar(stat = "identity") +
ggtitle("Top Shops by Conversion rate")+
theme(plot.title = element_text(hjust = 0.5)) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Show the top 20 products by checkout conversion rate (checkouts over views) for the customer’s brand and their average checkout price
top_20_products_by_checkout_conversion_rate <- dbGetQuery(con,"SELECT *,
ROUND(Total_checkout/Total_views_product, 2) AS conversion_rate
FROM
(SELECT DISTINCT A.product_id,
SUM(A.views) OVER (PARTITION BY A.product_id) AS Total_views_product,
SUM(A.Checkouts) OVER (PARTITION BY A.product_id) AS Total_checkout,
ROUND(Avg(avg_checkout_price) OVER (PARTITION BY A.product_id),2) AS Avg_checkout_price
FROM
(SELECT PCSD.*,
sho.country,
sho.name AS Shop_name,
Pro.*,
PTD.views,
PTD.unique_views,
PTD.unique_visitors
FROM
(SELECT *
FROM customer.product_conversions_by_shop_by_day) PCSD
LEFT JOIN
(SELECT *
FROM customer.shops) Sho ON PCSD.shop_id = sho.id
LEFT JOIN
(SELECT *
FROM customer.products) Pro ON Pro.id = PCSD.product_id
LEFT JOIN
(SELECT *
FROM customer.product_traffic_by_day) PTD ON PTD.product_id = PCSD.product_id
AND PTD.date = PCSD.date
AND UPPER(PTD.website) = UPPER(PCSD.website)
AND UPPER(PTD.device_type) = UPPER(PCSD.device_type)
WHERE checkouts>0)A)DUMMY
WHERE Total_views_product IS NOT NULL
AND Total_views_product>Total_checkout
ORDER BY conversion_rate DESC
LIMIT 20")
paged_table(top_20_products_by_checkout_conversion_rate)
Visualization of the results
top_20_products_by_checkout_conversion_rate <- select(top_20_products_by_checkout_conversion_rate, avg_checkout_price ,product_id,conversion_rate)
top_20_products_by_checkout_conversion_rate$product_id <- as.character(top_20_products_by_checkout_conversion_rate$product_id)
mdata3 <-top_20_products_by_checkout_conversion_rate
ggplot(data = mdata3, aes(x = product_id, y = avg_checkout_price )) +
geom_bar(stat = "identity")+ geom_text(aes(label=conversion_rate), vjust=-0.5,size=4) +
ggtitle("Top 20 products by checkout conversion_rate")+
theme(plot.title = element_text(hjust = 0.5)) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
The vastness of the amount of data and important labels makes a single vizualization difficult to load.
This provides an oppourtinity for the creation of a Shiny web app tool.
The data can be furthur explored with deeper business insights.