Load the toolset.
# Load required toolset
library(readr)
library(dplyr)
library(ggplot2)
library(DescTools)
library(lubridate)
library(knitr)
# Simple function for arranging factors for plotting.
f <- function(x){
require(forcats)
fct_rev(fct_inorder(x))
}
Using the data provided, I am asked to provide insight into the following three questions:
The data sets to be used can be found here: https://www.kaggle.com/olistbr/marketing-funnel-olist.
Discussion points for the telephone meeting:
Therefore, I will be using an R Markdown notebook to share results as this is a good format to achieve goals above. R Markdown notebooks allow me to weave together narrative text, code, and visuals in a nicely formatted document.
Most of the analytical choices I made came from a detailed exploration of the available data. A condensed form of what I did is included in the section below. We can skip this section to save time.
# Import marketing funnel data
mql <- read_csv("olist_marketing_qualified_leads_dataset.csv",
col_types = cols(first_contact_date = col_date(format = "%Y-%m-%d")))
deals <- read_csv("olist_closed_deals_dataset.csv",
col_types = cols(won_date = col_datetime(format = "%Y-%m-%d %H:%M:%S")))
deals$won_date <- as.Date(deals$won_date) # Convert date
# Import revenue data
items <- read_csv("olist_order_items_dataset.csv",
col_types = cols(order_item_id = col_integer(),
shipping_limit_date = col_skip()))
orders <- read_csv("olist_orders_dataset.csv",
col_types = cols(order_approved_at = col_skip(),
order_delivered_carrier_date = col_skip(),
order_delivered_customer_date = col_skip(),
order_estimated_delivery_date = col_skip(),
customer_id = col_skip(),
order_purchase_timestamp = col_datetime(format = "%Y-%m-%d %H:%M:%S")))
orders$order_purchase_timestamp <- as.Date(orders$order_purchase_timestamp) # Convert date
View data and compute selected summary stats:
kable(sample_n(mql, 15), caption = "Sample of MQL Data") # View data
| mql_id | first_contact_date | landing_page_id | origin |
|---|---|---|---|
| b90658ac073c1d522b88fe8d00ba799f | 2018-04-30 | 9fa37f5177115156b1814ffc237402a6 | paid_search |
| de316fb56f066179bf15b2f2a69f53dd | 2018-03-27 | 73f31a40697cc90a86c1273563ac230e | other |
| 1e2a84cd1c6fcd7b7a07f49f1cf7f6cc | 2018-05-08 | 22c29808c4f815213303f8933030604c | organic_search |
| 83218450304f89053114eaa3b1487815 | 2018-01-04 | 40dec9f3d5259a3d2dbcdab2114fae47 | paid_search |
| f5772c97a8e2bcf909674d8701480d55 | 2018-01-30 | ce1a65abd0973638f1c887a6efcfa82d | organic_search |
| 89599d85cfa823d8886146af161ea5b3 | 2018-04-16 | 22c29808c4f815213303f8933030604c | organic_search |
| 73752e30dc460dabd746d6ebba649e19 | 2018-03-21 | f017be4dbf86243af5c1ebed0cff36a2 | organic_search |
| 5290f86dffbdf4f7673878fe5a544535 | 2018-05-24 | 35c9b150ab36fe584c1f24fd458c453a | paid_search |
| 4e9e9815a54fc4d54e960e96c843c4cb | 2018-05-10 | 22c29808c4f815213303f8933030604c | organic_search |
| 88b60db46229bc2e21574705df09a08b | 2018-05-08 | 58326e62183c14b0c03085c33b9fdc44 | social |
| c5e72242931c10ddd2b0ceb628df444c | 2018-03-16 | 22c29808c4f815213303f8933030604c | paid_search |
| 2483125d070c795ba1ed28b241c5c737 | 2018-05-26 | 58326e62183c14b0c03085c33b9fdc44 | social |
| 85b88bc0f811d78392c3f2fdc45f9ee2 | 2018-01-16 | b76ef37428e6799c421989521c0e5077 | unknown |
| 11b3e5a318f289cbcaedad1f899a3345 | 2018-02-03 | b6cd08ac77c501fd6857f7916d2c24f3 | |
| 0b97be8b4b408a3a0780a32e0a143a7f | 2018-01-22 | b76ef37428e6799c421989521c0e5077 | unknown |
Desc(mql, plotit = F) # Explore
## ------------------------------------------------------------------------------
## Describe mql (spec_tbl_df, tbl_df, tbl, data.frame):
##
## data frame: 8000 obs. of 4 variables
## 7940 complete cases (99.2%)
##
## Nr ColName Class NAs Levels
## 1 mql_id character .
## 2 first_contact_date Date .
## 3 landing_page_id character .
## 4 origin character 60 (0.8%)
##
##
## ------------------------------------------------------------------------------
## 1 - mql_id (character)
##
## length n NAs unique levels dupes
## 8'000 8'000 0 8'000 8'000 n
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 0002ac0d783338cfeab0b2bdbd872cda 1 0.0% 1 0.0%
## 2 000dd3543ac84d906eae52e7c779bb2a 1 0.0% 2 0.0%
## 3 001d3439223b7bb23ed89b9c8890d096 1 0.0% 3 0.0%
## 4 00275937100699204a3d5ae3caf7190d 1 0.0% 4 0.0%
## 5 003d859bbd84a526b9e06340ce7d185f 1 0.0% 5 0.1%
## 6 00532c4adc6d5dadb6d58e9de817b59a 1 0.0% 6 0.1%
## 7 005b0c27e7224dabb8c1c7346ceea228 1 0.0% 7 0.1%
## 8 0067035172fc786488ff7c6317ed88c9 1 0.0% 8 0.1%
## 9 00701bfa411bb151a8516b9730037148 1 0.0% 9 0.1%
## 10 007491783420080b25639249163d109d 1 0.0% 10 0.1%
## 11 0091e45c86dc2efef2743c810c61a3bb 1 0.0% 11 0.1%
## 12 0093fe2eedeb098315bf9251da1a5f03 1 0.0% 12 0.1%
## ... etc.
## [list output truncated]
##
## ------------------------------------------------------------------------------
## 2 - first_contact_date (Date)
##
## length n NAs unique
## 8'000 8'000 0 336
## 100.0% 0.0%
##
## lowest : 2017-06-14 (2), 2017-06-16, 2017-06-20, 2017-07-02
## highest: 2018-05-28 (57), 2018-05-29 (37), 2018-05-30 (30), 2018-05-31 (26)
##
##
## Weekday:
##
## Pearson's Chi-squared test (1-dim uniform):
## X-squared = 996.68, df = 6, p-value < 2.2e-16
##
## level freq perc cumfreq cumperc
## 1 Monday 1'495 18.7% 1'495 18.7%
## 2 Tuesday 1'452 18.1% 2'947 36.8%
## 3 Wednesday 1'491 18.6% 4'438 55.5%
## 4 Thursday 1'354 16.9% 5'792 72.4%
## 5 Friday 1'150 14.4% 6'942 86.8%
## 6 Saturday 527 6.6% 7'469 93.4%
## 7 Sunday 531 6.6% 8'000 100.0%
##
## Months:
##
## Pearson's Chi-squared test (1-dim uniform):
## X-squared = 3965.8, df = 11, p-value < 2.2e-16
##
## level freq perc cumfreq cumperc
## 1 January 1'141 14.3% 1'141 14.3%
## 2 February 1'028 12.8% 2'169 27.1%
## 3 March 1'174 14.7% 3'343 41.8%
## 4 April 1'352 16.9% 4'695 58.7%
## 5 May 1'303 16.3% 5'998 75.0%
## 6 June 4 0.0% 6'002 75.0%
## 7 July 239 3.0% 6'241 78.0%
## 8 August 386 4.8% 6'627 82.8%
## 9 September 312 3.9% 6'939 86.7%
## 10 October 416 5.2% 7'355 91.9%
## 11 November 445 5.6% 7'800 97.5%
## 12 December 200 2.5% 8'000 100.0%
##
## By weeks :
##
## level freq perc cumfreq cumperc
## 1 2017-06-12 3 0.0% 3 0.0%
## 2 2017-06-19 1 0.0% 4 0.0%
## 3 2017-06-26 1 0.0% 5 0.1%
## 4 2017-07-03 42 0.5% 47 0.6%
## 5 2017-07-10 51 0.6% 98 1.2%
## 6 2017-07-17 67 0.8% 165 2.1%
## 7 2017-07-24 59 0.7% 224 2.8%
## 8 2017-07-31 87 1.1% 311 3.9%
## 9 2017-08-07 83 1.0% 394 4.9%
## 10 2017-08-14 91 1.1% 485 6.1%
## 11 2017-08-21 85 1.1% 570 7.1%
## 12 2017-08-28 77 1.0% 647 8.1%
## 13 2017-09-04 69 0.9% 716 8.9%
## 14 2017-09-11 90 1.1% 806 10.1%
## 15 2017-09-18 69 0.9% 875 10.9%
## 16 2017-09-25 74 0.9% 949 11.9%
## 17 2017-10-02 90 1.1% 1'039 13.0%
## 18 2017-10-09 61 0.8% 1'100 13.8%
## 19 2017-10-16 105 1.3% 1'205 15.1%
## 20 2017-10-23 117 1.5% 1'322 16.5%
## 21 2017-10-30 90 1.1% 1'412 17.6%
## 22 2017-11-06 96 1.2% 1'508 18.9%
## 23 2017-11-13 99 1.2% 1'607 20.1%
## 24 2017-11-20 126 1.6% 1'733 21.7%
## 25 2017-11-27 88 1.1% 1'821 22.8%
## 26 2017-12-04 26 0.3% 1'847 23.1%
## 27 2017-12-11 33 0.4% 1'880 23.5%
## 28 2017-12-18 43 0.5% 1'923 24.0%
## 29 2017-12-25 79 1.0% 2'002 25.0%
## 30 2018-01-01 216 2.7% 2'218 27.7%
## 31 2018-01-08 257 3.2% 2'475 30.9%
## 32 2018-01-15 247 3.1% 2'722 34.0%
## 33 2018-01-22 268 3.4% 2'990 37.4%
## 34 2018-01-29 271 3.4% 3'261 40.8%
## 35 2018-02-05 264 3.3% 3'525 44.1%
## 36 2018-02-12 220 2.8% 3'745 46.8%
## 37 2018-02-19 270 3.4% 4'015 50.2%
## 38 2018-02-26 261 3.3% 4'276 53.4%
## 39 2018-03-05 261 3.3% 4'537 56.7%
## 40 2018-03-12 298 3.7% 4'835 60.4%
## 41 2018-03-19 279 3.5% 5'114 63.9%
## 42 2018-03-26 250 3.1% 5'364 67.0%
## 43 2018-04-02 338 4.2% 5'702 71.3%
## 44 2018-04-09 298 3.7% 6'000 75.0%
## 45 2018-04-16 328 4.1% 6'328 79.1%
## 46 2018-04-23 316 4.0% 6'644 83.0%
## 47 2018-04-30 303 3.8% 6'947 86.8%
## 48 2018-05-07 288 3.6% 7'235 90.4%
## 49 2018-05-14 304 3.8% 7'539 94.2%
## 50 2018-05-21 311 3.9% 7'850 98.1%
## 51 2018-05-28 150 1.9% 8'000 100.0%
##
## ------------------------------------------------------------------------------
## 3 - landing_page_id (character)
##
## length n NAs unique levels dupes
## 8'000 8'000 0 495 495 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 b76ef37428e6799c421989521c0e5077 912 11.4% 912 11.4%
## 2 22c29808c4f815213303f8933030604c 883 11.0% 1'795 22.4%
## 3 58326e62183c14b0c03085c33b9fdc44 495 6.2% 2'290 28.6%
## 4 88740e65d5d6b056e0cda098e1ea6313 445 5.6% 2'735 34.2%
## 5 ce1a65abd0973638f1c887a6efcfa82d 394 4.9% 3'129 39.1%
## 6 40dec9f3d5259a3d2dbcdab2114fae47 330 4.1% 3'459 43.2%
## 7 f017be4dbf86243af5c1ebed0cff36a2 310 3.9% 3'769 47.1%
## 8 e492ee5eaf1697716985cc6f33f9cd9b 291 3.6% 4'060 50.8%
## 9 a7982125ff7aa3b2054c6e44f9d28522 156 1.9% 4'216 52.7%
## 10 73f31a40697cc90a86c1273563ac230e 115 1.4% 4'331 54.1%
## 11 241f79c7a8fe0270f4fb79fcbbcd17ad 109 1.4% 4'440 55.5%
## 12 65d9f9d71e562365e8b44037c2888d98 95 1.2% 4'535 56.7%
## ... etc.
## [list output truncated]
##
## ------------------------------------------------------------------------------
## 4 - origin (character)
##
## length n NAs unique levels dupes
## 8'000 7'940 60 10 10 y
## 99.2% 0.8%
##
## level freq perc cumfreq cumperc
## 1 organic_search 2'296 28.9% 2'296 28.9%
## 2 paid_search 1'586 20.0% 3'882 48.9%
## 3 social 1'350 17.0% 5'232 65.9%
## 4 unknown 1'099 13.8% 6'331 79.7%
## 5 direct_traffic 499 6.3% 6'830 86.0%
## 6 email 493 6.2% 7'323 92.2%
## 7 referral 284 3.6% 7'607 95.8%
## 8 other 150 1.9% 7'757 97.7%
## 9 display 118 1.5% 7'875 99.2%
## 10 other_publicities 65 0.8% 7'940 100.0%
# Plot MQLs over time
mutate(mql, week = floor_date(first_contact_date, "week")) %>%
group_by(week) %>%
summarise(count = n()) %>%
ggplot(aes(x = week, y = count)) +
geom_area(color = "blue", fill = "blue", alpha = 0.3) +
labs(title = "Leads by Week", x = NULL, y = "Number of Leads")
Observations and recommendations:
landing_page_ids which seems like a lot. How many tests were conducted? How many per test? We should investigate how the tracking is set up.View data and compute selected summary stats:
glimpse(deals, ) # Better way to view wide data.
## Rows: 842
## Columns: 14
## $ mql_id <chr> "5420aad7fec3549a85876ba1c529bd84", "...
## $ seller_id <chr> "2c43fb513632d29b3b58df74816f1b06", "...
## $ sdr_id <chr> "a8387c01a09e99ce014107505b92388c", "...
## $ sr_id <chr> "4ef15afb4b2723d8f3d81e51ec7afefe", "...
## $ won_date <date> 2018-02-26, 2018-05-08, 2018-06-05, ...
## $ business_segment <chr> "pet", "car_accessories", "home_appli...
## $ lead_type <chr> "online_medium", "industry", "online_...
## $ lead_behaviour_profile <chr> "cat", "eagle", "cat", NA, "wolf", NA...
## $ has_company <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ has_gtin <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ average_stock <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ business_type <chr> "reseller", "reseller", "reseller", "...
## $ declared_product_catalog_size <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ declared_monthly_revenue <dbl> 0e+00, 0e+00, 0e+00, 0e+00, 0e+00, 0e...
Desc(deals$declared_monthly_revenue, plotit = F)
## ------------------------------------------------------------------------------
## deals$declared_monthly_revenue (numeric)
##
## length n NAs unique 0s mean meanCI
## 842 842 0 27 797 73'377.68 -44'644.32
## 100.0% 0.0% 94.7% 191'399.68
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 0.00 0.00 0.00 0.00 3'850.00
##
## range sd vcoef mad IQR skew kurt
## 5.00e+07 1'744'799.18 23.78 0.00 0.00 27.94 793.73
##
## lowest : 0.0 (797), 6.0, 1'000.0, 4'000.0, 5'000.0 (2)
## highest: 250'000.0 (2), 300'000.0 (2), 500'000.0, 8'000'000.0, 5.00e+07
##
## heap(?): remarkable frequency (94.7%) for the mode(s) (= 0)
Observations and recommendations:
declared_monthly_revenue to calculate the potential revenue from a typical client? No. This is mostly zeros.The Kaggle site directs users to another page to get comprehensive data on revenues. The link is here: https://www.kaggle.com/olistbr/brazilian-ecommerce/home. This data has already been imported. Let’s now explore the revenue data.
View data and compute selected summary stats:
# View data
kable(sample_n(items, 15), caption = "Sample of Items Data")
| order_id | order_item_id | product_id | seller_id | price | freight_value |
|---|---|---|---|---|---|
| 94aa7cffedc2aed45e67a865fe6ec0f9 | 1 | 247fa5b4e2f524a22b21ef256ffc23e4 | 3bdff180c7e1f6551a643b99c265a120 | 29.90 | 15.11 |
| 3777fdee1e90e53636623e02ecccc4ba | 1 | 7715f7da32957a84ad267799121a90e2 | beadbee30901a7f61d031b6b686095ad | 129.99 | 8.17 |
| 7dfd72bed82d922fdde7a5c897353932 | 1 | 9e927cd87b7c76f32f8f42ef2ede7923 | 7299e27ed73d2ad986de7f7c77d919fa | 159.99 | 17.56 |
| a2801b8cd69a7543e074b6c66e0b6e42 | 1 | f71050e65776861c8c3ec7a2d2b0194c | 53e4c6e0f4312d4d2107a8c9cddf45cd | 85.71 | 18.18 |
| 5c3a8f58f95a93d61a60220d61ea851a | 1 | 6c96a2727d7c8e30804367b7341c7b9c | c3867b4666c7d76867627c2f7fb22e21 | 89.00 | 12.12 |
| 1476819dd3e3d7fc87ee304b36af52bb | 1 | 92503fc18ca14a8f32c182ce3f1b9f05 | 3586b8580d9c917874e053a1bb37b5ff | 29.90 | 25.19 |
| 036f684797c77d9beffa596570e198bd | 1 | 1d7db62a21ffb4f220d3e8a05fa77e02 | 643214e62b870443ccbe55ab29a4dccf | 32.00 | 14.10 |
| 182e6edb8cb190cada2690d7c7d78e76 | 1 | 5d66715cc928aadd0074f61332698593 | 128639473a139ac0f3e5f5ade55873a5 | 18.90 | 15.10 |
| 2bdda5cf39975296883f9b09394d0dd2 | 1 | 1b14ba724d4ad3094f074ca9b36aae05 | 8b321bb669392f5163d04c59e235e066 | 12.90 | 7.39 |
| 8c64ba171311b3312bd880905b8da534 | 1 | cac9e5692471a0700418aa3400b9b2b1 | 7ea5bfa6c340f58f8e71fc1f0412b0d6 | 99.49 | 23.20 |
| 7a627462a30fe10f41a14f7f84bd4171 | 1 | 3fbc0ef745950c7932d5f2a446189725 | 06a2c3af7b3aee5d69171b0e14f0ee87 | 64.99 | 27.51 |
| 0d9e44ba18b82a7d8e8a9e61681a3e2a | 1 | 47ccf73e45bf059c94e9d025329206ac | b2ba3715d723d245138f291a6fe42594 | 298.90 | 10.45 |
| c9677a4babf96837ee697317b7963a92 | 1 | cc2232dbef2c9fca23f4c7f6a19a42e3 | 198c7ea11960a9844b544d9bcdca860c | 14.99 | 17.78 |
| 2def71c3b4ee133da4476d4d0ca8baa0 | 1 | 542b3d80712d51eefe3a3cd1c5f4e8fe | 537eb890efff034a88679788b647c564 | 42.90 | 22.67 |
| d043a3bb843a11bb2f3a826771849ec4 | 1 | 683d1efd7b8fb9bbe7e78c7436f27248 | 8e6cc767478edae941d9bd9eb778d77a | 39.90 | 15.56 |
kable(sample_n(orders, 15), caption = "Sample of Orders Data")
| order_id | order_status | order_purchase_timestamp |
|---|---|---|
| cf37324c9d35fb8f5f9d14cd6d48a3a2 | delivered | 2017-05-14 |
| fad32fa5c9e17b0d55a6afb8e54bfbfa | delivered | 2017-06-08 |
| f559ac2258ea5dd75bf9ba503e7bc6e3 | delivered | 2017-02-09 |
| da225d51cdb2a615ac38734cfdaebab4 | delivered | 2017-02-10 |
| 80c7a16d8b2eed5720ce819c547229b9 | delivered | 2017-08-31 |
| 8df58f40fe1df92d0594a2ff4993ab55 | delivered | 2017-06-28 |
| d8cc1cf815465b0812b29a738492e4f9 | delivered | 2017-06-22 |
| 3f6e9699013cd3929f5afc72ee8af297 | delivered | 2018-05-03 |
| 233962c509ebbdb05dacfbe831d15d5b | delivered | 2017-10-11 |
| 6275954c6646fe2a6fcc3b1f7771de04 | delivered | 2017-11-24 |
| 8c2919b1ba90a4a9c174a97fd0cb09aa | delivered | 2017-11-14 |
| 2429083883078c31cf2337ce0dd425ee | unavailable | 2017-07-06 |
| b84c178bc4308555f74ed9cdf3a38193 | delivered | 2018-01-25 |
| 6f118f6a730ce35b230974f295f743f9 | delivered | 2018-01-20 |
| 2f7c478f21ac2d388886deb706e200b3 | delivered | 2017-02-25 |
# View order status
Desc(orders$order_status, plotit = F)
## ------------------------------------------------------------------------------
## orders$order_status (character)
##
## length n NAs unique levels dupes
## 99'441 99'441 0 8 8 y
## 100.0% 0.0%
##
## level freq perc cumfreq cumperc
## 1 delivered 96'478 97.0% 96'478 97.0%
## 2 shipped 1'107 1.1% 97'585 98.1%
## 3 canceled 625 0.6% 98'210 98.8%
## 4 unavailable 609 0.6% 98'819 99.4%
## 5 invoiced 314 0.3% 99'133 99.7%
## 6 processing 301 0.3% 99'434 100.0%
## 7 created 5 0.0% 99'439 100.0%
## 8 approved 2 0.0% 99'441 100.0%
Observations and recommendations:
order_item_id has a bad label. This is not an ID, it is quantity. Should rename.The following code joins the items and orders tables and computes the items totals and commission.
# Join orders and items
j_items <- inner_join(items, orders, by = c("order_id")) %>%
filter(order_status %in% c("delivered", "shipped")) %>% # only include delivered and shipped
mutate(item_total = (order_item_id * price) + freight_value, # Compute items total
comm = item_total * 0.2) # Compute commission
# sample_n(j_items, 20)
glimpse(j_items)
## Rows: 111,382
## Columns: 10
## $ order_id <chr> "00010242fe8c5a6d1ba2dd792cb16214", "00018...
## $ order_item_id <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ product_id <chr> "4244733e06e7ecb4970a6e2683c13e61", "e5f2d...
## $ seller_id <chr> "48436dade18ac8b2bce089ec2a041202", "dd7dd...
## $ price <dbl> 58.90, 239.90, 199.00, 12.99, 199.90, 21.9...
## $ freight_value <dbl> 13.29, 19.93, 17.87, 12.79, 18.14, 12.69, ...
## $ order_status <chr> "delivered", "delivered", "delivered", "de...
## $ order_purchase_timestamp <date> 2017-09-13, 2017-04-26, 2018-01-14, 2018-...
## $ item_total <dbl> 72.19, 259.83, 216.87, 25.78, 218.04, 34.5...
## $ comm <dbl> 14.438, 51.966, 43.374, 5.156, 43.608, 6.9...
# Clean up workspace
rm(items, orders)
# Plot sales
mutate(j_items, week = floor_date(order_purchase_timestamp, "week")) %>%
group_by(week) %>%
summarise(sales = sum(item_total)/1000) %>%
ggplot(aes(x = week, y = sales)) +
geom_area(color = "blue", fill = "blue", alpha = 0.3) +
labs(title = "Sales by Week", x = NULL, y = "Sales (1,000s BR)")
Observations and recommendations:
The following code computes the sales statistics per seller.
# How much time are we seeing
window <- difftime(max(j_items$order_purchase_timestamp),
min(j_items$order_purchase_timestamp), "days") %>% as.integer()
# Compute summary stats by seller
seller <- group_by(j_items, seller_id) %>%
summarise(min_date = min(order_purchase_timestamp),
max_date = max(order_purchase_timestamp),
date_diff = as.integer(difftime(max_date, min_date, "days")) + 1,
num_orders = length(unique(order_id)),
total_orders = sum(item_total),
total_comm = sum(comm))
kable(sample_n(seller, 15), caption = "Sample of Seller Statistics", digits = 2) # View data
| seller_id | min_date | max_date | date_diff | num_orders | total_orders | total_comm |
|---|---|---|---|---|---|---|
| a5ff20ff766e7f50bd5066f33da14413 | 2018-08-19 | 2018-08-24 | 6 | 2 | 162.37 | 32.47 |
| 54965bbe3e4f07ae045b90b0b8541f52 | 2017-10-18 | 2018-07-03 | 259 | 76 | 13827.15 | 2765.43 |
| 455f46ef09a9e45667e2981df84b5cc2 | 2018-08-07 | 2018-08-07 | 1 | 1 | 48.21 | 9.64 |
| c1849d4d32d7a6cecd5aa471809d3135 | 2018-08-04 | 2018-08-26 | 23 | 3 | 430.71 | 86.14 |
| eb9267cccc90f1b49c8d2f9887c7dd97 | 2018-05-06 | 2018-05-06 | 1 | 1 | 346.27 | 69.25 |
| de722cd6dad950a92b7d4f82673f8833 | 2018-01-09 | 2018-08-23 | 227 | 339 | 73806.05 | 14761.21 |
| afe0067131b73e40875c9b6c10bd2e21 | 2017-02-22 | 2018-06-10 | 474 | 45 | 6151.59 | 1230.32 |
| d93844a9c55ba7ce353388bcf849ea56 | 2017-03-15 | 2018-04-29 | 411 | 12 | 3489.32 | 697.86 |
| 3f2af2670e104d1bcb54022274daeac5 | 2017-09-29 | 2018-05-19 | 233 | 6 | 1423.08 | 284.62 |
| 38874e327ce94d11390b96eb42d61928 | 2016-10-08 | 2017-10-27 | 385 | 6 | 4158.43 | 831.69 |
| 59cea8e446d3834393058e7e0666b6fb | 2018-06-16 | 2018-08-16 | 62 | 10 | 901.14 | 180.23 |
| b1ac6ea7895bc3dd6f0f6f4abbdd2821 | 2017-03-11 | 2017-12-15 | 280 | 39 | 3813.01 | 762.60 |
| 13e85aac53340018b40c2af68001b7e0 | 2016-10-08 | 2016-10-08 | 1 | 1 | 31.92 | 6.38 |
| edc518ba25576a0188a0e8f2c3af186f | 2018-06-30 | 2018-08-20 | 52 | 6 | 2017.74 | 403.55 |
| 5566627eab3fe93fd6ee08798226101a | 2017-04-13 | 2018-01-22 | 285 | 5 | 187.81 | 37.56 |
# Plot total commission
ggplot(seller, aes(x = total_comm)) +
geom_histogram(binwidth = 500, boundary = 0, fill = "blue", alpha = 0.5) +
# coord_cartesian(xlim = c(0, 10000)) +
labs(title = "Distribution of Commission per Seller", subtitle = "Binwidth = 500",
x = "Total Commission ($BR)", y = "Number of Sellers")
Desc(seller$total_comm, plotit = F)
## ------------------------------------------------------------------------------
## seller$total_comm (numeric)
##
## length n NAs unique 0s mean meanCI
## 2'977 2'977 0 2'965 0 1'166.6239 1'035.6067
## 100.0% 0.0% 0.0% 1'297.6412
##
## .05 .10 .25 median .75 .90 .95
## 13.0392 22.4624 60.8360 230.9440 904.1300 2'538.0564 4'469.8688
##
## range sd vcoef mad IQR skew kurt
## 68'707.3780 3'645.8029 3.1251 299.5326 843.2940 9.2361 113.2575
##
## lowest : 3.044, 3.272, 3.712, 3.842, 3.858
## highest: 44'896.6820, 51'068.4880, 51'109.8640, 52'330.3880, 68'710.4220
Observations and recommendations:
The following code computes conversion rates using the mql and deals data.
# Add won variable
mql <- mutate(mql, won = if_else(mql_id %in% deals$mql_id, T, F))
# Conversion rates - landing page
c_landing <- group_by(mql, landing_page_id) %>%
summarise(count = n(),
conv = round((sum(won)/n())*100, 1)) %>%
arrange(desc(conv)) %>%
mutate(landing_page_id = f(landing_page_id))
kable(sample_n(c_landing, 15), caption = "Sample of Landing Page Statistics") # View some stats
| landing_page_id | count | conv |
|---|---|---|
| 5679ae2329cbad78cdf97737d11bad06 | 4 | 0.0 |
| 1694e838dffa97f640fcfa35e86e438e | 12 | 0.0 |
| e2e67cce16ad4e30e1f8962402480f17 | 2 | 50.0 |
| 9496ccbb759ca86ff2b76adb4f7672a6 | 1 | 0.0 |
| 8515f4df7468489ed6bced112f86e360 | 3 | 66.7 |
| 38addfd5b0598e69bfa843b9c85c9a19 | 1 | 0.0 |
| 03c7c0ace395d80182db07ae2c30f034 | 1 | 0.0 |
| b48ec5f3b04e9068441002a19df93c6c | 51 | 13.7 |
| dd6f55db3947b0aeaadb65a610334799 | 16 | 0.0 |
| b16779a74f2b20eb9a17a785519f63a0 | 11 | 18.2 |
| 3aea860a87895eee6b72f666b3d7b98a | 1 | 0.0 |
| e42a14209c69c3e9cc6b042620465f12 | 29 | 3.4 |
| 65c9e96d17d8d12bc0ad3b2cde0bb8b7 | 9 | 11.1 |
| aafc43d934a043b451b1dc5a61943a6b | 8 | 0.0 |
| 3757ed31bc4b61f9e108bd871822c0c6 | 1 | 0.0 |
# Plot histogram
ggplot(c_landing, aes(x = count)) +
geom_histogram(binwidth = 10, boundary = 0, fill = "blue", alpha = 0.5) +
labs(title = "Distribution of Landing Page Sample", subtitle = "Binwidth = 10",
x = "Number in Sample", y = "Count")
Observations and recommendations:
# Plot results
filter(c_landing, count >= 50) %>%
ggplot(aes(x = conv, y = landing_page_id)) +
geom_col(fill = "blue", alpha = 0.5) +
labs(title = "Landing Page by Conversion Rate", subtitle = "Excluding pages with less than 50 trials",
x = "Conversion Rate (%)", y = "Landing Page ID")
Observations and recommendations:
Now let’s explore conversion rates by origin.
# Conversion rates - by origin
c_origin <- filter(mql, !is.na(origin)) %>%
group_by(origin) %>%
summarise(count = n(),
conv = round((sum(won)/n())*100, 1)) %>%
arrange(desc(conv)) %>%
mutate(origin = f(origin))
# Plot results
ggplot(c_origin, aes(x = conv, y = origin)) +
geom_col(fill = "blue", alpha = 0.5) +
labs(title = "Origin by Conversion Rate", x = "Conversion Rate (%)", y = "Traffic Origin")
Observations and recommendations:
unknown performed the best. I would want to investigate what is going on here. Is there a tracking problem? It is curious that these pages performed the best. This implies that there might be something systematic going on. If unkown was random then it would be closer to the average rate.landing_page_id and origin.What can we learn by a closer examination of the closed deals? Let’s try joining the seller stats we computed with the closed deals.
# Merge data
j_deals <- inner_join(deals, seller, by = c("seller_id")) %>%
select(c(1:8, 12, 15:20))
glimpse(j_deals)
## Rows: 378
## Columns: 15
## $ mql_id <chr> "5420aad7fec3549a85876ba1c529bd84", "327174d...
## $ seller_id <chr> "2c43fb513632d29b3b58df74816f1b06", "612170e...
## $ sdr_id <chr> "a8387c01a09e99ce014107505b92388c", "b90f871...
## $ sr_id <chr> "4ef15afb4b2723d8f3d81e51ec7afefe", "6565aa9...
## $ won_date <date> 2018-02-26, 2018-06-05, 2018-07-03, 2018-02...
## $ business_segment <chr> "pet", "home_appliances", "home_appliances",...
## $ lead_type <chr> "online_medium", "online_big", "industry", "...
## $ lead_behaviour_profile <chr> "cat", "cat", "wolf", NA, "cat", "eagle", "c...
## $ business_type <chr> "reseller", "reseller", "manufacturer", "man...
## $ min_date <date> 2018-06-20, 2018-06-11, 2018-07-19, 2018-06...
## $ max_date <date> 2018-07-23, 2018-08-22, 2018-08-27, 2018-06...
## $ date_diff <dbl> 34, 73, 40, 1, 14, 1, 135, 79, 68, 52, 40, 5...
## $ num_orders <int> 3, 107, 15, 1, 2, 1, 5, 8, 2, 14, 3, 20, 565...
## $ total_orders <dbl> 930.17, 25571.68, 923.73, 113.69, 225.70, 31...
## $ total_comm <dbl> 186.034, 5114.336, 184.746, 22.738, 45.140, ...
# Compute commision per order for each persona
comm <- filter(j_deals, lead_behaviour_profile %in% c("cat", "eagle", "wolf", "shark")) %>%
group_by(lead_behaviour_profile) %>%
summarise(count = n(),
comm = sum(total_comm),
comm_per_cust = sum(total_comm) / count) %>%
arrange(desc(comm_per_cust)) %>%
mutate(lead_behaviour_profile = f(lead_behaviour_profile))
# Plot
ggplot(comm, aes(x = lead_behaviour_profile, y = comm_per_cust)) +
geom_col(fill = "blue", alpha = 0.5) + coord_flip() +
labs(title = "Behaviour Profile by Commission",
x = "Persona", y = "Commission (per cust in $BR)")
kable(comm, caption = "Commission Statistics by Persona", digits = 2) # view data
| lead_behaviour_profile | count | comm | comm_per_cust |
|---|---|---|---|
| shark | 10 | 12754.66 | 1275.47 |
| eagle | 50 | 23077.51 | 461.55 |
| cat | 183 | 70025.79 | 382.65 |
| wolf | 43 | 5728.17 | 133.21 |
Observations and recommendations:
The following code finds the top performing landing pages for each persona.
# Compute top 5 pages per persona
p_profile <- inner_join(mql, deals, by = c("mql_id")) %>%
select(mql_id, landing_page_id, lead_behaviour_profile) %>%
filter(lead_behaviour_profile %in% c("cat", "eagle", "wolf", "shark")) %>%
group_by(landing_page_id, lead_behaviour_profile) %>%
summarise(count = n()) %>% ungroup() %>%
group_by(lead_behaviour_profile) %>%
arrange(lead_behaviour_profile, desc(count)) %>%
slice(1:5)
# Plot result
ggplot(p_profile, aes(x = count, y = f(landing_page_id))) +
geom_col(fill = "blue", alpha = 0.5) +
# facet_grid(rows = vars(landing_page_id), cols = vars(lead_behaviour_profile)) +
facet_wrap(vars(lead_behaviour_profile), scales = "free") +
labs(title = "Landing Page by Conversion Success (top 5)",
subtitle = "Pages sorted by overall success",
x = "Number of Conversions", y = "Landing Page ID")
Observations and recommendations:
b76ef... to learn how to appeal to sharks.