Avant take home exercise - Q4 selling widgets

Wen He - Product Analyst - 2018.8.12

Summary:

(Apologize for not having enough time to detailize the visualization and summarize every aspects - please refer to detail data table below)

1.Tuesday and Sunday have the highest total sales among other days of the week, while Friday has the lowest

2.the 15th hour of the day has the highest sales / customers, while the 12th, 1st and 20th have the lowest.

3.WI has the highest total average waiting time while NE and LA have the lowest

3.LA has the highest total sales while WI has the lowest

5.Segment members into 20 demi-decile groups based on the average waiting time between start_time and submit_time to apply different strategies for better customer shopping experience

6.For example, quantile is basically segment id, which means members in quantile 1 have the highest waiting time between start and submit

library load

library(tidyverse)
library(lubridate)
library(ggplot2)
library(DT)

import data and check if there’s data cleaning issues

widget <- read.csv('Analytics_TakeHome_Q4_DataSet.csv',header = TRUE)
head(widget)
##   order_id          start_time      submitted_time customer_id
## 1        1 2015-01-02 19:06:39 2015-01-02 19:23:43          83
## 2        2 2015-01-03 01:45:24 2015-01-03 01:50:09          62
## 3        3 2015-01-03 02:18:25 2015-01-03 02:29:21          65
## 4        4 2015-01-03 03:17:06 2015-01-03 03:32:21          11
## 5        5 2015-01-03 09:04:21 2015-01-03 09:11:39          64
## 6        6 2015-01-03 21:52:16 2015-01-03 22:21:20          16
##   destination_state product_id price quantity
## 1                TX          5 77.84       99
## 2                NE          1 49.14       18
## 3                LA          4 13.94      171
## 4                NE          1 47.27       24
## 5                NE          5 73.92       82
## 6                IA          1 50.50       20
str(widget)
## 'data.frame':    6225 obs. of  8 variables:
##  $ order_id         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ start_time       : Factor w/ 6225 levels "2015-01-02 19:06:39",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ submitted_time   : Factor w/ 6225 levels "2015-01-02 19:23:43",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ customer_id      : int  83 62 65 11 64 16 42 85 50 86 ...
##  $ destination_state: Factor w/ 7 levels "IA","IL","LA",..: 6 5 3 5 5 1 5 4 1 6 ...
##  $ product_id       : int  5 1 4 1 5 1 2 2 5 5 ...
##  $ price            : num  77.8 49.1 13.9 47.3 73.9 ...
##  $ quantity         : int  99 18 171 24 82 20 206 205 80 112 ...
table(is.na(widget))
## 
## FALSE 
## 49800

Transfer time from factor into timestamp

widget$start_time <- ymd_hms(widget$start_time)
widget$submitted_time <- ymd_hms(widget$submitted_time)

Alter data table by adding calculated fields

widget$time_diff <- as.period(interval(widget$start_time, widget$submitted_time), unit = "second")
widget$tt_sales  <- widget$price*widget$quantity         
widget$year <- year(widget$submitted_time)
widget$week <- week(widget$submitted_time)
widget$month <- month(widget$submitted_time)
widget$dayofweek <- wday(widget$submitted_time)
widget$hour <- hour(widget$submitted_time)
head(widget)
##   order_id          start_time      submitted_time customer_id
## 1        1 2015-01-02 19:06:39 2015-01-02 19:23:43          83
## 2        2 2015-01-03 01:45:24 2015-01-03 01:50:09          62
## 3        3 2015-01-03 02:18:25 2015-01-03 02:29:21          65
## 4        4 2015-01-03 03:17:06 2015-01-03 03:32:21          11
## 5        5 2015-01-03 09:04:21 2015-01-03 09:11:39          64
## 6        6 2015-01-03 21:52:16 2015-01-03 22:21:20          16
##   destination_state product_id price quantity time_diff tt_sales year week
## 1                TX          5 77.84       99     1024S  7706.16 2015    1
## 2                NE          1 49.14       18      285S   884.52 2015    1
## 3                LA          4 13.94      171      656S  2383.74 2015    1
## 4                NE          1 47.27       24      915S  1134.48 2015    1
## 5                NE          5 73.92       82      438S  6061.44 2015    1
## 6                IA          1 50.50       20     1744S  1010.00 2015    1
##   month dayofweek hour
## 1     1         6   19
## 2     1         7    1
## 3     1         7    2
## 4     1         7    3
## 5     1         7    9
## 6     1         7   22

Basic data summarize and visualization

# By year view
year_view <-
  widget %>% 
  group_by(year) %>% 
  summarise(
    total_distinct_customer = n_distinct(widget$customer_id) ,
    total_distinct_order = n_distinct(order_id),
    total_quantity = sum(quantity),
    total_sales = sum(tt_sales),
    sales_per_customer = total_sales / total_distinct_customer,
    sales_per_order = total_sales / total_distinct_order,
    quantity_per_customer = total_sales / total_distinct_customer,
    quantity_per_order = total_sales/ total_distinct_order,
    avg_waiting_time = mean(time_diff)
  )

datatable(year_view)
# By month view
month_view <-
widget %>% 
  group_by(month) %>% 
  summarise(
    total_distinct_customer = n_distinct(widget$customer_id) ,
    total_distinct_order = n_distinct(order_id),
    total_quantity = sum(quantity),
    total_sales = sum(tt_sales),
    sales_per_customer = total_sales / total_distinct_customer,
    sales_per_order = total_sales / total_distinct_order,
    quantity_per_customer = total_sales / total_distinct_customer,
    quantity_per_order = total_sales/ total_distinct_order,
    avg_waiting_time = mean(time_diff)
  )

datatable(month_view)
# By week view
week_view <-datatable (
widget %>% 
  group_by(week) %>% 
  summarise(
    total_distinct_customer = n_distinct(widget$customer_id) ,
    total_distinct_order = n_distinct(order_id),
    total_quantity = sum(quantity),
    total_sales = sum(tt_sales),
    sales_per_customer = total_sales / total_distinct_customer,
    sales_per_order = total_sales / total_distinct_order,
    quantity_per_customer = total_sales / total_distinct_customer,
    quantity_per_order = total_sales/ total_distinct_order,
    avg_waiting_time = mean(time_diff)
  )
 )

week_view
# by day of week view
# Tuesday and Sunday have the highest total sales among other days of the week, while Friday has the lowest
doweek_view <-

widget %>% 
  group_by(dayofweek) %>% 
  summarise(
    total_distinct_customer = n_distinct(widget$customer_id) ,
    total_distinct_order = n_distinct(order_id),
    total_quantity = sum(quantity),
    total_sales = sum(tt_sales),
    sales_per_customer = total_sales / total_distinct_customer,
    sales_per_order = total_sales / total_distinct_order,
    quantity_per_customer = total_sales / total_distinct_customer,
    quantity_per_order = total_sales/ total_distinct_order,
    avg_waiting_time = mean(time_diff)
  )
 

 datatable (doweek_view)
y <- ggplot(data = doweek_view, aes(x = dayofweek, y = total_sales)) 
y + geom_line(linetype = "dashed") + geom_point()

# by hour view
# the 15th hour of the day has the highest sales / customers, while the 12th, 1st and 20th have the lowest.
hour_view <-

widget %>% 
  group_by(hour) %>% 
  summarise(
    total_distinct_customer = n_distinct(widget$customer_id) ,
    total_distinct_order = n_distinct(order_id),
    total_quantity = sum(quantity),
    total_sales = sum(tt_sales),
    sales_per_customer = total_sales / total_distinct_customer,
    sales_per_order = total_sales / total_distinct_order,
    quantity_per_customer = total_sales / total_distinct_customer,
    quantity_per_order = total_sales/ total_distinct_order,
    avg_waiting_time = mean(time_diff)
  )


datatable(hour_view)
h <- ggplot(data = hour_view, aes(x = hour, y = sales_per_customer)) 
h + geom_line() + geom_point()

# by state view
# WI has the highest total average waiting time while NE and LA have the lowest
# LA has the highest total sales while WI has the lowest
state_view <-
widget %>% 
  group_by(destination_state) %>% 
  summarise(
    total_distinct_customer = n_distinct(widget$customer_id) ,
    total_distinct_order = n_distinct(order_id),
    total_quantity = sum(quantity),
    total_sales = sum(tt_sales),
    sales_per_customer = total_sales / total_distinct_customer,
    sales_per_order = total_sales / total_distinct_order,
    quantity_per_customer = total_sales / total_distinct_customer,
    quantity_per_order = total_sales/ total_distinct_order,
    avg_waiting_time = mean(time_diff)
  )

datatable(state_view)
m<-ggplot(data=state_view, aes(destination_state,avg_waiting_time, label = avg_waiting_time))
m + geom_point(aes(color = destination_state),size = 3) + 
  coord_flip() +
  theme_classic() +
  xlab("destination_state")+
  ylab("avg_waiting_time(s)")

k<-ggplot(data=state_view, aes(destination_state,total_sales, label = total_sales))
k + geom_point(aes(color = destination_state),size = 5) + 
  coord_flip() +
  theme_classic() +
  xlab("destination_state")+
  ylab("total_sales(s)")

Segment members into 20 demi-decile groups based on the average waiting time between start_time and submit_time to apply different strategies for better customer shopping experience

For example, quantile is basically segment id, which means members in quantile 1 have the highest waiting time between start and submit

customer <- widget[,c(1,4,9)]
customer_diff <- customer %>% 
  group_by(customer_id) %>% 
  summarise(avg_time_diff = mean(time_diff))

customer_diff$rank <- rank(-customer_diff$avg_time_diff,ties.method = "average")
customer_diff_rank <- customer_diff %>% 
  mutate(quantile = ntile(rank,20)) %>% 
  arrange(desc(quantile))

datatable(customer_diff_rank)