Avant take home exercise - Q4 selling widgets
Wen He - Product Analyst - 2018.8.12
Summary:
(Apologize for not having enough time to detailize the visualization and summarize every aspects - please refer to detail data table below)
1.Tuesday and Sunday have the highest total sales among other days of the week, while Friday has the lowest
2.the 15th hour of the day has the highest sales / customers, while the 12th, 1st and 20th have the lowest.
3.WI has the highest total average waiting time while NE and LA have the lowest
3.LA has the highest total sales while WI has the lowest
5.Segment members into 20 demi-decile groups based on the average waiting time between start_time and submit_time to apply different strategies for better customer shopping experience
6.For example, quantile is basically segment id, which means members in quantile 1 have the highest waiting time between start and submit
library load
library(tidyverse)
library(lubridate)
library(ggplot2)
library(DT)
import data and check if there’s data cleaning issues
widget <- read.csv('Analytics_TakeHome_Q4_DataSet.csv',header = TRUE)
head(widget)
## order_id start_time submitted_time customer_id
## 1 1 2015-01-02 19:06:39 2015-01-02 19:23:43 83
## 2 2 2015-01-03 01:45:24 2015-01-03 01:50:09 62
## 3 3 2015-01-03 02:18:25 2015-01-03 02:29:21 65
## 4 4 2015-01-03 03:17:06 2015-01-03 03:32:21 11
## 5 5 2015-01-03 09:04:21 2015-01-03 09:11:39 64
## 6 6 2015-01-03 21:52:16 2015-01-03 22:21:20 16
## destination_state product_id price quantity
## 1 TX 5 77.84 99
## 2 NE 1 49.14 18
## 3 LA 4 13.94 171
## 4 NE 1 47.27 24
## 5 NE 5 73.92 82
## 6 IA 1 50.50 20
str(widget)
## 'data.frame': 6225 obs. of 8 variables:
## $ order_id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ start_time : Factor w/ 6225 levels "2015-01-02 19:06:39",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ submitted_time : Factor w/ 6225 levels "2015-01-02 19:23:43",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ customer_id : int 83 62 65 11 64 16 42 85 50 86 ...
## $ destination_state: Factor w/ 7 levels "IA","IL","LA",..: 6 5 3 5 5 1 5 4 1 6 ...
## $ product_id : int 5 1 4 1 5 1 2 2 5 5 ...
## $ price : num 77.8 49.1 13.9 47.3 73.9 ...
## $ quantity : int 99 18 171 24 82 20 206 205 80 112 ...
table(is.na(widget))
##
## FALSE
## 49800
Transfer time from factor into timestamp
widget$start_time <- ymd_hms(widget$start_time)
widget$submitted_time <- ymd_hms(widget$submitted_time)
Alter data table by adding calculated fields
widget$time_diff <- as.period(interval(widget$start_time, widget$submitted_time), unit = "second")
widget$tt_sales <- widget$price*widget$quantity
widget$year <- year(widget$submitted_time)
widget$week <- week(widget$submitted_time)
widget$month <- month(widget$submitted_time)
widget$dayofweek <- wday(widget$submitted_time)
widget$hour <- hour(widget$submitted_time)
head(widget)
## order_id start_time submitted_time customer_id
## 1 1 2015-01-02 19:06:39 2015-01-02 19:23:43 83
## 2 2 2015-01-03 01:45:24 2015-01-03 01:50:09 62
## 3 3 2015-01-03 02:18:25 2015-01-03 02:29:21 65
## 4 4 2015-01-03 03:17:06 2015-01-03 03:32:21 11
## 5 5 2015-01-03 09:04:21 2015-01-03 09:11:39 64
## 6 6 2015-01-03 21:52:16 2015-01-03 22:21:20 16
## destination_state product_id price quantity time_diff tt_sales year week
## 1 TX 5 77.84 99 1024S 7706.16 2015 1
## 2 NE 1 49.14 18 285S 884.52 2015 1
## 3 LA 4 13.94 171 656S 2383.74 2015 1
## 4 NE 1 47.27 24 915S 1134.48 2015 1
## 5 NE 5 73.92 82 438S 6061.44 2015 1
## 6 IA 1 50.50 20 1744S 1010.00 2015 1
## month dayofweek hour
## 1 1 6 19
## 2 1 7 1
## 3 1 7 2
## 4 1 7 3
## 5 1 7 9
## 6 1 7 22
Basic data summarize and visualization
# By year view
year_view <-
widget %>%
group_by(year) %>%
summarise(
total_distinct_customer = n_distinct(widget$customer_id) ,
total_distinct_order = n_distinct(order_id),
total_quantity = sum(quantity),
total_sales = sum(tt_sales),
sales_per_customer = total_sales / total_distinct_customer,
sales_per_order = total_sales / total_distinct_order,
quantity_per_customer = total_sales / total_distinct_customer,
quantity_per_order = total_sales/ total_distinct_order,
avg_waiting_time = mean(time_diff)
)
datatable(year_view)
# By month view
month_view <-
widget %>%
group_by(month) %>%
summarise(
total_distinct_customer = n_distinct(widget$customer_id) ,
total_distinct_order = n_distinct(order_id),
total_quantity = sum(quantity),
total_sales = sum(tt_sales),
sales_per_customer = total_sales / total_distinct_customer,
sales_per_order = total_sales / total_distinct_order,
quantity_per_customer = total_sales / total_distinct_customer,
quantity_per_order = total_sales/ total_distinct_order,
avg_waiting_time = mean(time_diff)
)
datatable(month_view)
# By week view
week_view <-datatable (
widget %>%
group_by(week) %>%
summarise(
total_distinct_customer = n_distinct(widget$customer_id) ,
total_distinct_order = n_distinct(order_id),
total_quantity = sum(quantity),
total_sales = sum(tt_sales),
sales_per_customer = total_sales / total_distinct_customer,
sales_per_order = total_sales / total_distinct_order,
quantity_per_customer = total_sales / total_distinct_customer,
quantity_per_order = total_sales/ total_distinct_order,
avg_waiting_time = mean(time_diff)
)
)
week_view
# by day of week view
# Tuesday and Sunday have the highest total sales among other days of the week, while Friday has the lowest
doweek_view <-
widget %>%
group_by(dayofweek) %>%
summarise(
total_distinct_customer = n_distinct(widget$customer_id) ,
total_distinct_order = n_distinct(order_id),
total_quantity = sum(quantity),
total_sales = sum(tt_sales),
sales_per_customer = total_sales / total_distinct_customer,
sales_per_order = total_sales / total_distinct_order,
quantity_per_customer = total_sales / total_distinct_customer,
quantity_per_order = total_sales/ total_distinct_order,
avg_waiting_time = mean(time_diff)
)
datatable (doweek_view)
y <- ggplot(data = doweek_view, aes(x = dayofweek, y = total_sales))
y + geom_line(linetype = "dashed") + geom_point()

# by hour view
# the 15th hour of the day has the highest sales / customers, while the 12th, 1st and 20th have the lowest.
hour_view <-
widget %>%
group_by(hour) %>%
summarise(
total_distinct_customer = n_distinct(widget$customer_id) ,
total_distinct_order = n_distinct(order_id),
total_quantity = sum(quantity),
total_sales = sum(tt_sales),
sales_per_customer = total_sales / total_distinct_customer,
sales_per_order = total_sales / total_distinct_order,
quantity_per_customer = total_sales / total_distinct_customer,
quantity_per_order = total_sales/ total_distinct_order,
avg_waiting_time = mean(time_diff)
)
datatable(hour_view)
h <- ggplot(data = hour_view, aes(x = hour, y = sales_per_customer))
h + geom_line() + geom_point()

# by state view
# WI has the highest total average waiting time while NE and LA have the lowest
# LA has the highest total sales while WI has the lowest
state_view <-
widget %>%
group_by(destination_state) %>%
summarise(
total_distinct_customer = n_distinct(widget$customer_id) ,
total_distinct_order = n_distinct(order_id),
total_quantity = sum(quantity),
total_sales = sum(tt_sales),
sales_per_customer = total_sales / total_distinct_customer,
sales_per_order = total_sales / total_distinct_order,
quantity_per_customer = total_sales / total_distinct_customer,
quantity_per_order = total_sales/ total_distinct_order,
avg_waiting_time = mean(time_diff)
)
datatable(state_view)
m<-ggplot(data=state_view, aes(destination_state,avg_waiting_time, label = avg_waiting_time))
m + geom_point(aes(color = destination_state),size = 3) +
coord_flip() +
theme_classic() +
xlab("destination_state")+
ylab("avg_waiting_time(s)")

k<-ggplot(data=state_view, aes(destination_state,total_sales, label = total_sales))
k + geom_point(aes(color = destination_state),size = 5) +
coord_flip() +
theme_classic() +
xlab("destination_state")+
ylab("total_sales(s)")

Segment members into 20 demi-decile groups based on the average waiting time between start_time and submit_time to apply different strategies for better customer shopping experience
For example, quantile is basically segment id, which means members in quantile 1 have the highest waiting time between start and submit
customer <- widget[,c(1,4,9)]
customer_diff <- customer %>%
group_by(customer_id) %>%
summarise(avg_time_diff = mean(time_diff))
customer_diff$rank <- rank(-customer_diff$avg_time_diff,ties.method = "average")
customer_diff_rank <- customer_diff %>%
mutate(quantile = ntile(rank,20)) %>%
arrange(desc(quantile))
datatable(customer_diff_rank)