1 Goal


The goal of this task is to conduct an Exploratory Data Analysis (EDA) on data provided by Idealo.
The data is stored on a AWS in the following base tables.
Brands
products
categories
shops
product_conversions_by_shop_by_day
brand_traffic_by_week
product_traffic_by_day


2 Universal Libraries


Theese libraries are used throught the code. Libraries specific to the line of code are defined with the code.

library(ggplot2)     # Used to Create Visualization 
library(dplyr)       # Used to Manipulate and Clean Data
library(reshape2)    # Used to Manipulate Data for Plotting
library(DBI)         # Used to query DB

3 Connection Parameters


Creating a Connection to the Database to Enable Postgres Queries.

db <- 'db_30444'  #provide the name of your db

host_db <- 'candidates-test-do-not-delete.cj0lhhdrhdo9.eu-central-1.rds.amazonaws.com' 

db_port <- '5432'  # or any other port specified by the DBA

db_user <- 'roshan_kumar_bhuyan'

db_password <- 'AEM~Jh]?!L}t2gh('

con <- dbConnect(RPostgres::Postgres(), dbname = db, host=host_db, port=db_port, user=db_user, password=db_password)

4 Exploring Tables


Getting an initial initution of the Data Stored within the various tables.

Brands

SELECT * FROM customer.brands LIMIT 100
Displaying records 1 - 5
id name own_brand
7382 Brand 7382 FALSE
5704 Brand 5704 FALSE
5274 Brand 5274 TRUE
5338 Brand 5338 FALSE
5670 Brand 5670 FALSE
Brands <- dbGetQuery(con,"SELECT * FROM customer.brands LIMIT 100")
str(Brands)
## 'data.frame':    11 obs. of  3 variables:
##  $ id       :integer64 7382 5704 5274 5338 5670 5258 5587 5291 ... 
##  $ name     : chr  "Brand 7382" "Brand 5704" "Brand 5274" "Brand 5338" ...
##  $ own_brand: logi  FALSE FALSE TRUE FALSE FALSE FALSE ...

Products

SELECT * FROM customer.products LIMIT 100
Displaying records 1 - 5
id main_product_id brand_id category_id name startprice
1154650 NA 5258 6183 Product 1154650 0
1472291 NA 5622 6183 Product 1472291 0
1904246 NA 5371 6183 Product 1904246 22357
1977346 NA 5275 6183 Product 1977346 49990
2143085 NA 6562 6183 Product 2143085 70000
products <- dbGetQuery(con, "SELECT * FROM customer.products LIMIT 100")
str(products)
## 'data.frame':    100 obs. of  6 variables:
##  $ id             :integer64 1154650 1472291 1904246 1977346 2143085 2157930 2181201 2181787 ... 
##  $ main_product_id:integer64 NA NA NA NA NA NA NA NA ... 
##  $ brand_id       :integer64 5258 5622 5371 5275 6562 5371 7342 5704 ... 
##  $ category_id    :integer64 6183 6183 6183 6183 6183 6183 6183 6183 ... 
##  $ name           : chr  "Product 1154650" "Product 1472291" "Product 1904246" "Product 1977346" ...
##  $ startprice     :integer64 0 0 22357 49990 70000 61599 37900 41900 ...

brand_traffic_by_week

SELECT * FROM customer.brand_traffic_by_week LIMIT 100
Displaying records 1 - 5
week_start website device_type brand_id views unique_views unique_visitors
2018-12-31 AT desktop 5248 79 50 38
2018-12-31 AT desktop 5258 1880 959 702
2018-12-31 AT desktop 5260 747 406 305
2018-12-31 AT desktop 5261 636 412 301
2018-12-31 AT desktop 5274 7657 3745 2874
brand_traffic_by_week <- dbGetQuery(con, "SELECT * FROM customer.brand_traffic_by_week LIMIT 100")
str(brand_traffic_by_week)
## 'data.frame':    100 obs. of  7 variables:
##  $ week_start     : Date, format: "2018-12-31" "2018-12-31" ...
##  $ website        : chr  "AT" "AT" "AT" "AT" ...
##  $ device_type    : chr  "desktop" "desktop" "desktop" "desktop" ...
##  $ brand_id       :integer64 5248 5258 5260 5261 5274 5275 5291 5327 ... 
##  $ views          :integer64 79 1880 747 636 7657 1 7244 128 ... 
##  $ unique_views   :integer64 50 959 406 412 3745 1 4691 89 ... 
##  $ unique_visitors:integer64 38 702 305 301 2874 1 3521 70 ...

categories

SELECT * FROM customer.categories LIMIT 100
3 records
id name
6183 Category 1
6402 Category 2
8201 Category 3
categories <- dbGetQuery(con, "SELECT * FROM customer.categories LIMIT 100")
str(categories)
## 'data.frame':    3 obs. of  2 variables:
##  $ id  :integer64 6183 6402 8201 
##  $ name: chr  "Category 1" "Category 2" "Category 3"

product_conversions_by_shop_by_day

SELECT * FROM customer.product_conversions_by_shop_by_day LIMIT 100
Displaying records 1 - 5
date website product_id device_type shop_id leadouts checkouts avg_leadout_price avg_checkout_price
2019-01-01 AT 4774094 desktop 14523 1 0 1290 NA
2019-01-01 AT 4852800 desktop 14523 1 0 478 NA
2019-01-01 AT 4854521 desktop 14523 1 0 534 NA
2019-01-01 AT 5755724 desktop 14523 1 0 533 NA
2019-01-01 AT 4709528 mobile 14523 1 0 262 NA
product_conversions_by_shop_by_day <- dbGetQuery(con, "SELECT * FROM customer.product_conversions_by_shop_by_day LIMIT 100")
str(product_conversions_by_shop_by_day)
## 'data.frame':    100 obs. of  9 variables:
##  $ date              : Date, format: "2019-01-01" "2019-01-01" ...
##  $ website           : chr  "AT" "AT" "AT" "AT" ...
##  $ product_id        :integer64 4774094 4852800 4854521 5755724 4709528 5125099 5165136 5511073 ... 
##  $ device_type       : chr  "desktop" "desktop" "desktop" "desktop" ...
##  $ shop_id           :integer64 14523 14523 14523 14523 14523 14523 24759 24759 ... 
##  $ leadouts          :integer64 1 1 1 1 1 1 1 1 ... 
##  $ checkouts         :integer64 0 0 0 0 0 0 0 0 ... 
##  $ avg_leadout_price : num  1290 478 534 533 262 446 335 343 474 714 ...
##  $ avg_checkout_price: num  NA NA NA NA NA NA NA NA NA NA ...

shops

SELECT * FROM customer.shops
Displaying records 1 - 5
id country name
5382 DE Shop 223
5604 DE Shop 17
6622 DE Shop 319
6642 DE Shop 143
7282 DE Shop 189
shops <- dbGetQuery(con, "SELECT * FROM customer.shops")
str(shops)
## 'data.frame':    644 obs. of  3 variables:
##  $ id     :integer64 5382 5604 6622 6642 7282 7742 8882 9262 ... 
##  $ country: chr  "DE" "DE" "DE" "DE" ...
##  $ name   : chr  "Shop 223" "Shop 17" "Shop 319" "Shop 143" ...

product_traffic_by_day

SELECT * FROM customer.product_traffic_by_day LIMIT 100
Displaying records 1 - 5
date website product_id device_type views unique_views unique_visitors
2019-01-24 ES 4453281 desktop 12 8 8
2019-01-24 AT 6190750 desktop 1 1 1
2019-01-24 AT 5109796 desktop 1 1 1
2019-01-24 FR 6340006 mobile 2 2 2
2019-01-24 UK 5763200 desktop 1 1 1
product_traffic_by_day <- dbGetQuery(con, "SELECT * FROM customer.product_traffic_by_day LIMIT 100")
str(product_traffic_by_day)
## 'data.frame':    100 obs. of  7 variables:
##  $ date           : Date, format: "2019-01-24" "2019-01-24" ...
##  $ website        : chr  "ES" "AT" "AT" "FR" ...
##  $ product_id     :integer64 4453281 6190750 5109796 6340006 5763200 6361620 5109796 5530574 ... 
##  $ device_type    : chr  "desktop" "desktop" "desktop" "mobile" ...
##  $ views          :integer64 12 1 1 2 1 2 2 1 ... 
##  $ unique_views   :integer64 8 1 1 2 1 2 2 1 ... 
##  $ unique_visitors:integer64 8 1 1 2 1 2 2 1 ...

5 Additional Information Provided for the Task


The “brands” table lists the customer’s own brand together with its top competitors. The customer’s own brand is flagged as “own_brand = true”

The “products” table lists all products in the categories where the customer is active, including those from brands that are not listed in the “brands” table

All products in the “products” table that have a “main_product_id” are to be considered a variant of such main product - for example a specific color or size. Metrics of such products should therefore be grouped by “main_product_id” when working on product reports

Conversion metrics are defined as follows:
A “leadout” is a click on idealo that leads to a partner shop
A “checkout” is a click on idealo that directly leads to a checkout on idealo itself


6 Task 1- Total views received


For each week in the data set, show the total views received by products of the customer’s brand together with the average total views received by products of other brands listed in the “brands” table (within the same week).

The total views are obtained by Aggegration and average views for other brands come form the brand_traffic_by_week table and are joined by brand ID

SELECT * FROM (SELECT DISTINCT sda.brand_id,
                sda.Week,
                --sda.own_brand,
                Sum(sda.views) OVER (PARTITION BY sda.brand_id,
                                                  sda.Week) AS Total_views_Per_Brand,
                                    BFT.views AS competitor_views
FROM
  (SELECT *,
          to_char(PTBD.date, 'IW-IYYY') AS WEEK
   FROM
     (SELECT *
      FROM customer.products) Pro
   LEFT JOIN
     (SELECT *
      FROM customer.Brands) Bra ON Pro.brand_id = Bra.id
   LEFT JOIN
     (SELECT *
      FROM customer.product_traffic_by_day) PTBD ON PTBD.product_id = Pro.id) sda
LEFT JOIN
  (SELECT avg(views) as views,brand_id,
          to_char(week_start, 'IW-IYYY') AS WEEK
   FROM customer.brand_traffic_by_week 
   GROUP BY brand_id,WEEK Order by brand_id) BFT ON Sda.week = BFT.week
AND Sda.brand_id = BFT.brand_id
WHERE sda.week IS NOT NULL  ORDER BY sda.brand_id)A
Displaying records 1 - 5
brand_id week total_views_per_brand competitor_views
5248 01-2019 53 1322.182
5248 02-2019 43 1207.167
5248 03-2019 41 1131.500
5248 04-2019 41 1160.083
5248 05-2019 683 1226.000

This displays Data only for the top 5 brands mentioned in the brands table. Commenting “WHERE brand_id in (SELECT id FROM customer.Brands LIMIT 5)” would provide all results

Total_views_per_brand_1 <- dbGetQuery(con,
"SELECT * FROM (SELECT DISTINCT sda.brand_id,
                sda.Week,
                --sda.own_brand,
                Sum(sda.views) OVER (PARTITION BY sda.brand_id,
                                                  sda.Week) AS Total_views_Per_Brand,
                                    BFT.views AS competitor_views
FROM
  (SELECT *,
          to_char(PTBD.date, 'IW-IYYY') AS WEEK
   FROM
     (SELECT *
      FROM customer.products) Pro
   LEFT JOIN
     (SELECT *
      FROM customer.Brands) Bra ON Pro.brand_id = Bra.id
   LEFT JOIN
     (SELECT *
      FROM customer.product_traffic_by_day) PTBD ON PTBD.product_id = Pro.id) sda
LEFT JOIN
  (SELECT avg(views) as views,brand_id,
          to_char(week_start, 'IW-IYYY') AS WEEK
   FROM customer.brand_traffic_by_week 
   GROUP BY brand_id,WEEK Order by brand_id) BFT ON Sda.week = BFT.week
AND Sda.brand_id = BFT.brand_id
WHERE sda.week IS NOT NULL  ORDER BY sda.brand_id)A

WHERE brand_id in (SELECT id FROM customer.Brands LIMIT 5)
")

library(rmarkdown)
paged_table(Total_views_per_brand_1)

Visualization of the results

library(reshape2)
Total_views_per_brand_1$brand_id <- as.character(Total_views_per_brand_1$brand_id)  
mdata3 <- melt(Total_views_per_brand_1, id=c("week","brand_id"))

library(directlabels) # Library for adding names to lines on plots
ggplot(mdata3 , aes(x = week, y = value, col = variable,group = 1)) + geom_line() +  ylim(10, 5000)+
  geom_dl(aes(label = variable) , method = list(dl.combine("last.points"), cex = 0.7)) +
  facet_grid(brand_id ~ .) + ggtitle("Sample Brands vs Competitors Vizualization") +
  theme(legend.position = "none")


7 Task 2 - Mobile Traffic in DE and unique visitors


For each week in the data set and only for mobile traffic in DE, show the unique visitors of the customer’s brand and the average unique visitors of other brands that are listed in the “brands” table (within the same week)

SELECT DISTINCT sda.brand_id, 
                sda.week, 
                sda.own_brand, 
                Sum(sda.unique_visitors) 
OVER ( 
  partition BY sda.brand_id, sda.week, sda.website, sda.device_type) AS 
                Total_weekly_unique_visits_Per_Brand, 
                BFT.unique_views 
                AS competitor_views 
FROM   (SELECT *, 
               To_char(PTBD.date, 'IW-IYYY') AS Week 
        FROM   (SELECT * 
                FROM   customer.products) Pro 
               LEFT JOIN (SELECT * 
                          FROM   customer.brands) Bra 
                      ON Pro.brand_id = Bra.id 
               LEFT JOIN (SELECT * 
                          FROM   customer.product_traffic_by_day) PTBD 
                      ON PTBD.product_id = Pro.id)sda 
       LEFT JOIN (SELECT avg(unique_visitors) as unique_views,brand_id,
                         To_char(week_start, 'IW-IYYY') AS week 
                  FROM   customer.brand_traffic_by_week
                 GROUP BY brand_id,WEEK Order by brand_id
                 )BFT 
              ON Sda.week = BFT.week 
                 AND Sda.brand_id = BFT.brand_id 
WHERE  Sda.week IS NOT NULL 
       AND Upper(Sda.website) LIKE 'DE' 
       AND Upper(Sda.device_type) LIKE '%MOBILE%' 
ORDER  BY week ASC
Displaying records 1 - 5
brand_id week own_brand total_weekly_unique_visits_per_brand competitor_views
5248 05-2019 NA 179 545.0833
5258 05-2019 FALSE 4317 6271.9167
5274 05-2019 TRUE 4218 11192.2500
5275 05-2019 FALSE 1358 2.5000
5291 05-2019 FALSE 1095 24955.8333

This displays Data only for the top 5 brands mentioned in the brands table. Commenting “AND Sda.brand_id in (SELECT id FROM customer.Brands )” would provide all results

unique_visitors_of_the_customer_brand_2 <- dbGetQuery(con,
    "SELECT DISTINCT sda.brand_id, 
                sda.week, 
                sda.own_brand, 
                Sum(sda.unique_visitors) 
OVER ( 
  partition BY sda.brand_id, sda.week, sda.website, sda.device_type) AS 
                Total_weekly_unique_visits_Per_Brand, 
                BFT.unique_views 
                AS competitor_views 
FROM   (SELECT *, 
               To_char(PTBD.date, 'IW-IYYY') AS Week 
        FROM   (SELECT * 
                FROM   customer.products) Pro 
               LEFT JOIN (SELECT * 
                          FROM   customer.brands) Bra 
                      ON Pro.brand_id = Bra.id 
               LEFT JOIN (SELECT * 
                          FROM   customer.product_traffic_by_day) PTBD 
                      ON PTBD.product_id = Pro.id)sda 
       LEFT JOIN (SELECT avg(unique_visitors) as unique_views,brand_id,
                         To_char(week_start, 'IW-IYYY') AS week 
                  FROM   customer.brand_traffic_by_week
                 GROUP BY brand_id,WEEK Order by brand_id
                 )BFT 
              ON Sda.week = BFT.week 
                 AND Sda.brand_id = BFT.brand_id 
WHERE  Sda.week IS NOT NULL 
       AND Upper(Sda.website) LIKE 'DE' 
       AND Upper(Sda.device_type) LIKE '%MOBILE%' 
       AND  Sda.brand_id in (SELECT id FROM customer.Brands )
ORDER  BY week ASC ")

paged_table(unique_visitors_of_the_customer_brand_2)

Visualization of the results

library(reshape2)
unique_visitors_of_the_customer_brand_2$brand_id <- as.character(unique_visitors_of_the_customer_brand_2$brand_id)  
mdata3 <- melt(unique_visitors_of_the_customer_brand_2, id=c("week","brand_id"))

library(directlabels) # Library for adding names to lines on plots
ggplot(mdata3 , aes(x = week, y = value, col = variable,group = 1)) + geom_line() +  ylim(0, 5000) +
  geom_dl(aes(label = variable) , method = list(dl.combine("last.points"), cex = 0.7)) +
  facet_grid(brand_id ~ .) + ggtitle("Sample Brands vs Competitors Vizualization") +
  theme(legend.position = "none")


8 Task 3 - Share of product views, in percentage


Show the all-time share of product views, in percentage, for all brands in the data set. Flag as “others” the views of those brands which are not listed in the “brands” table

NOTE: Other are listed as 0

NOTE: The Data here is filtered to one brand. Commenting “WHERE brand_id in (SELECT id FROM customer.Brands limit 1)” provides data for all brands. This is done in order to make the visualization faster.

share_of_product_views_3 <- dbGetQuery(con,"SELECT *,
       COALESCE(ROUND((Total_views_Per_product/NULLIF(Total_views_Per_brand, 0))*100, 3), 0) AS Percentage_view
FROM
  (SELECT DISTINCT sda.id,
                   sda.brand_id,
                   Sum(sda.views) OVER (PARTITION BY sda.id) AS Total_views_Per_product,
                                       Sum(sda.views) OVER (PARTITION BY sda.brand_id) AS Total_views_Per_brand
   FROM
     (SELECT Pro.id,
             COALESCE(Pro.brand_id, 0) AS brand_id,
             Pro.category_id,
             Pro.name,
             Pro.startprice,
             Bra.own_brand,
             PTBD.date,
             to_char(PTBD.date, 'IW-IYYY') AS WEEK,
             PTBD.website,
             PTBD.product_id,
             PTBD.device_type,
             COALESCE (PTBD.views,
                       0) AS VIEWS,
                      COALESCE (PTBD.unique_views,
                                0) AS unique_views,
                               COALESCE (PTBD.unique_visitors,
                                         0) AS unique_visitors
      FROM
        (SELECT COALESCE (main_product_id,
                          id) AS id,
                         brand_id,
                         category_id,
                         name,
                         startprice
         FROM customer.products) Pro
      LEFT JOIN
        (SELECT *
         FROM customer.Brands) Bra ON Pro.brand_id = Bra.id
      LEFT JOIN
        (SELECT *
         FROM customer.product_traffic_by_day) PTBD ON PTBD.product_id = Pro.id)sda)A
         WHERE brand_id in (SELECT id FROM customer.Brands limit 1) 
ORDER BY Percentage_view DESC limit 100
")

paged_table(share_of_product_views_3)

Visualization of the results

share_of_product_views_3 <- select(share_of_product_views_3, id,brand_id,percentage_view)
share_of_product_views_3$brand_id <- as.character(share_of_product_views_3$brand_id)  
share_of_product_views_3$id <- as.character(share_of_product_views_3$id)  
mdata3 <-share_of_product_views_3
ggplot(data = mdata3, aes(x = brand_id, y = percentage_view, fill = id)) +
  geom_bar(stat = "identity") +
  ggtitle("Product Percentage of views")+
  theme(plot.title = element_text(hjust = 0.5))


9 Task 4 - Top 20 shops by leadouts


Show the top 20 shops by leadouts for the customer’s brand and their average leadout price

top_20_shops_by_leadouts_4  <- dbGetQuery(con,"SELECT DISTINCT brand_id,
                shop_id,
                Shop_name as shop_name ,
                SUM(leadouts) OVER (PARTITION BY brand_id,
                                                 shop_id) AS Total_leadout,
                                   ROUND(avg(avg_leadout_price) OVER (PARTITION BY brand_id, shop_id),2) AS avg_leadout_price
FROM
  (SELECT PCSD.*,
          sho.country,
          sho.name AS Shop_name,
          Pro.*
   FROM
     (SELECT *
      FROM customer.product_conversions_by_shop_by_day) PCSD
   LEFT JOIN
     (SELECT *
      FROM customer.shops) Sho ON PCSD.shop_id = sho.id
   LEFT JOIN
     (SELECT *
      FROM customer.products) Pro ON Pro.id = PCSD.product_id)A
ORDER BY Total_leadout DESC
LIMIT 20")

paged_table(top_20_shops_by_leadouts_4)

Visualization of the results

top_20_shops_by_leadouts_4 <- select(top_20_shops_by_leadouts_4, shop_name ,brand_id,avg_leadout_price,total_leadout)
top_20_shops_by_leadouts_4$brand_id <- as.character(top_20_shops_by_leadouts_4$brand_id)
top_20_shops_by_leadouts_4$shop_name <- as.character(top_20_shops_by_leadouts_4$shop_name) 
 
mdata3 <-top_20_shops_by_leadouts_4
ggplot(data = mdata3, aes(x = shop_name, y = avg_leadout_price )) +
  geom_bar(stat = "identity") +  
  ggtitle("Top Shops by Conversion rate")+
  theme(plot.title = element_text(hjust = 0.5)) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))


10 Task 5 - Top 20 products by checkout conversion rate


Show the top 20 products by checkout conversion rate (checkouts over views) for the customer’s brand and their average checkout price

top_20_products_by_checkout_conversion_rate  <- dbGetQuery(con,"SELECT *,
       ROUND(Total_checkout/Total_views_product, 2) AS conversion_rate
FROM
  (SELECT DISTINCT A.product_id,
                   SUM(A.views) OVER (PARTITION BY A.product_id) AS Total_views_product,
                                     SUM(A.Checkouts) OVER (PARTITION BY A.product_id) AS Total_checkout,
                                                           ROUND(Avg(avg_checkout_price) OVER (PARTITION BY A.product_id),2) AS Avg_checkout_price
   FROM
     (SELECT PCSD.*,
             sho.country,
             sho.name AS Shop_name,
             Pro.*,
             PTD.views,
             PTD.unique_views,
             PTD.unique_visitors
      FROM
        (SELECT *
         FROM customer.product_conversions_by_shop_by_day) PCSD
      LEFT JOIN
        (SELECT *
         FROM customer.shops) Sho ON PCSD.shop_id = sho.id
      LEFT JOIN
        (SELECT *
         FROM customer.products) Pro ON Pro.id = PCSD.product_id
      LEFT JOIN
        (SELECT *
         FROM customer.product_traffic_by_day) PTD ON PTD.product_id = PCSD.product_id
      AND PTD.date = PCSD.date
      AND UPPER(PTD.website) = UPPER(PCSD.website)
      AND UPPER(PTD.device_type) = UPPER(PCSD.device_type)
      WHERE checkouts>0)A)DUMMY
WHERE Total_views_product IS NOT NULL
  AND Total_views_product>Total_checkout
ORDER BY conversion_rate DESC
LIMIT 20")

paged_table(top_20_products_by_checkout_conversion_rate)

Visualization of the results

top_20_products_by_checkout_conversion_rate <- select(top_20_products_by_checkout_conversion_rate, avg_checkout_price ,product_id,conversion_rate)
top_20_products_by_checkout_conversion_rate$product_id <- as.character(top_20_products_by_checkout_conversion_rate$product_id)


mdata3 <-top_20_products_by_checkout_conversion_rate
ggplot(data = mdata3, aes(x = product_id, y = avg_checkout_price )) +
  geom_bar(stat = "identity")+ geom_text(aes(label=conversion_rate), vjust=-0.5,size=4) +
              ggtitle("Top 20 products by checkout conversion_rate")+
              theme(plot.title = element_text(hjust = 0.5)) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) 


11 Observations and Conclusion


The vastness of the amount of data and important labels makes a single vizualization difficult to load.
This provides an oppourtinity for the creation of a Shiny web app tool.
The data can be furthur explored with deeper business insights.