An exploration of the Instacart Dataset (2017), which contains a sample of more than 3 million Instacart orders placed by 200K+ Instacart users.
library(readxl)
library(dplyr)
library(ggplot2)
library(knitr)
library(stringr)
library(lubridate)
library(scales)
library(tidyverse)
library(tidyquant)
library(gghighlight)
library(treemap)
# Read In Instacart Data ----
orders <- read.csv("C:/Users/open/Documents/R PROJECTS/orders.csv")
products <- read.csv("C:/Users/open/Documents/R PROJECTS/products.csv")
order_prod_prior <- read.csv("C:/Users/open/Documents/R PROJECTS/order_products__prior.csv")
# Data Frames ----
# Orders - Orders (Updated) has 3.42M rows of data inclusive of Order ID, User ID, Order No, DOW, Time of Day and Days Since Prior Order. Note: DOW is an int, so I will interpret 0 as the Start of the Week, 1 Monday, etc.
# First, I am going to change the numeric days to character DOW.There is no designation, but I will assume '0' is Sunday and so on.
orders_update_tbl <- orders %>%
mutate(order_dow = order_dow +1) %>%
mutate(order_dow = wday(order_dow, label = TRUE))
kable(head(orders_update_tbl,20))
| 2539329 |
1 |
prior |
1 |
Tue |
8 |
NA |
| 2398795 |
1 |
prior |
2 |
Wed |
7 |
15 |
| 473747 |
1 |
prior |
3 |
Wed |
12 |
21 |
| 2254736 |
1 |
prior |
4 |
Thu |
7 |
29 |
| 431534 |
1 |
prior |
5 |
Thu |
15 |
28 |
| 3367565 |
1 |
prior |
6 |
Tue |
7 |
19 |
| 550135 |
1 |
prior |
7 |
Mon |
9 |
20 |
| 3108588 |
1 |
prior |
8 |
Mon |
14 |
14 |
| 2295261 |
1 |
prior |
9 |
Mon |
16 |
0 |
| 2550362 |
1 |
prior |
10 |
Thu |
8 |
30 |
| 1187899 |
1 |
train |
11 |
Thu |
8 |
14 |
| 2168274 |
2 |
prior |
1 |
Tue |
11 |
NA |
| 1501582 |
2 |
prior |
2 |
Fri |
10 |
10 |
| 1901567 |
2 |
prior |
3 |
Mon |
10 |
3 |
| 738281 |
2 |
prior |
4 |
Tue |
10 |
8 |
| 1673511 |
2 |
prior |
5 |
Wed |
11 |
8 |
| 1199898 |
2 |
prior |
6 |
Tue |
9 |
13 |
| 3194192 |
2 |
prior |
7 |
Tue |
12 |
14 |
| 788338 |
2 |
prior |
8 |
Mon |
15 |
27 |
| 1718559 |
2 |
prior |
9 |
Tue |
9 |
8 |
glimpse(orders_update_tbl)
## Rows: 3,421,083
## Columns: 7
## $ order_id <int> 2539329, 2398795, 473747, 2254736, 431534, 3...
## $ user_id <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,...
## $ eval_set <fct> prior, prior, prior, prior, prior, prior, pr...
## $ order_number <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1, 2, 3, ...
## $ order_dow <ord> Tue, Wed, Wed, Thu, Thu, Tue, Mon, Mon, Mon,...
## $ order_hour_of_day <int> 8, 7, 12, 7, 15, 7, 9, 14, 16, 8, 8, 11, 10,...
## $ days_since_prior_order <dbl> NA, 15, 21, 29, 28, 19, 20, 14, 0, 30, 14, N...
# Products - Products includes Product ID, Product Name ("All-Seasons Salt", "Chocolate Sandwich Cookies"), Aisle ID and Department ID.
kable(head(products,5))
| 1 |
Chocolate Sandwich Cookies |
61 |
19 |
| 2 |
All-Seasons Salt |
104 |
13 |
| 3 |
Robust Golden Unsweetened Oolong Tea |
94 |
7 |
| 4 |
Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce |
38 |
1 |
| 5 |
Green Chile Anytime Sauce |
5 |
13 |
glimpse(products)
## Rows: 49,688
## Columns: 4
## $ product_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ product_name <fct> Chocolate Sandwich Cookies, All-Seasons Salt, Robust ...
## $ aisle_id <int> 61, 104, 94, 38, 5, 11, 98, 116, 120, 115, 31, 119, 1...
## $ department_id <int> 19, 13, 7, 1, 13, 11, 7, 1, 16, 7, 7, 1, 11, 17, 18, ...
# Order_Prod_Prior - Includes Order ID, Product ID, Add to Cart Order Total and then, if it was a reorder for the customer.
kable(head(order_prod_prior,5))
| 2 |
33120 |
1 |
1 |
| 2 |
28985 |
2 |
1 |
| 2 |
9327 |
3 |
0 |
| 2 |
45918 |
4 |
1 |
| 2 |
30035 |
5 |
0 |
glimpse(order_prod_prior)
## Rows: 32,434,489
## Columns: 4
## $ order_id <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3...
## $ product_id <int> 33120, 28985, 9327, 45918, 30035, 17794, 40141, 1...
## $ add_to_cart_order <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8...
## $ reordered <int> 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1...
# Analysis ----
# Examining Order Total by Day of Week - Hex Code is Reflective of Instacart's Primary Brand Color
orders_update_tbl %>%
ggplot(aes(x = order_dow))+
geom_bar(stat = "count", fill = "#60ab59") +
scale_y_continuous(labels = scales::comma) +
labs(
title = "Instacart: Orders by Day of Week",
subtitle = "Instacart sees the greatest number of orders on Sunday and Monday, whereas Tuesday - Saturday are relatively in-line with one another.",
x = "",
y = "Orders"
)

# Examining Order Total by Hour - 1
orders_update_tbl %>%
ggplot(aes(x = order_hour_of_day))+
geom_histogram(stat = "count", fill = "#60ab59") +
scale_y_continuous(labels = scales::comma) +
labs(
title = "Instacart: Orders by Hour",
subtitle = "The majority of Instacart orders are placed between 9am and 4pm. Peak hours appear to be 10am and 11am. Order traffic remains steady from 10am - 3pm.",
x = "",
y = "Orders"
)
## Warning: Ignoring unknown parameters: binwidth, bins, pad

# Examining Order Total by Hour - 2
orders_update_tbl %>%
ggplot(aes(x = order_hour_of_day))+
geom_line(stat = "count", color = "#60ab59", size=2) +
scale_y_continuous(labels = scales::comma) +
labs(
title = "Instacart: Orders by Hour",
subtitle = "The majority of Instacart orders are placed between 9am and 4pm. Peak hours appear to be 10am and 11am. Order volume remains steady from 10am - 3pm.",
x = "",
y = "Orders"
)

# How many days between orders?
orders_update_tbl %>%
ggplot(aes(x = days_since_prior_order))+
geom_histogram(stat = "count", fill = "#60ab59") +
scale_y_continuous(labels = scales::comma) +
labs(
title = "Instacart: Length of Time Between Orders",
subtitle = "A sizeable portion of Instacart shoppers order again within one week of their initial order. However, the data shows that Instacart is seeing the greatest portion of re-orders placed 30 days after the intial order. ",
x = "Days",
y = "Orders"
)
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 206209 rows containing non-finite values (stat_count).

# Count of Orders?
orders_update_tbl %>%
filter(eval_set == "prior") %>%
count(order_number) %>%
ggplot(aes(x = order_number, n)) +
geom_line(color = "#60ab59", size = 2) +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = seq(0, 100, 5),
limits = c(0, 100)) +
labs(
title = "Count of Orders",
subtitle = "The vast majority of Instacart shoppers have placed at least 3-5 orders.",
x = "Count: Prior Orders",
y = ""
)

# How many items are generally purchased with each Instacart order?
average_order_size_tbl <- order_prod_prior %>%
group_by(order_id) %>%
summarize(n_items = last(add_to_cart_order))
## `summarise()` ungrouping output (override with `.groups` argument)
average_order_size_tbl %>%
ggplot(aes(x = n_items))+
geom_histogram(stat = "count", fill = "#60ab59") +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = seq(0, 60, 5),
limits = c(0, 60)) +
labs(
title = "Instacart: Items Per Order",
subtitle = "Instacart Shoppers generally purchase 5-6 items per order.",
x = "Number of Items",
y = ""
)
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 813 rows containing non-finite values (stat_count).
## Warning: Removed 1 rows containing missing values (geom_bar).

# Examining Top Products - What items are ordered most often? Interestingly, bananas, organic bananas, organic strawberries, organic baby spinach and organic avocados are among the most ordered grocery items - Healthy!
most_ordered_tbl <- order_prod_prior %>%
group_by(product_id) %>%
summarize(count = n()) %>%
top_n(20, wt = count) %>%
left_join(select(products, product_id, product_name), by = "product_id") %>%
arrange(desc(count))
## `summarise()` ungrouping output (override with `.groups` argument)
most_ordered_tbl %>%
ggplot(aes(x = reorder(product_name, -count), y = count))+
geom_col(stat = "identity", fill = "#60ab59")+
scale_y_continuous(labels = scales::comma)+
labs(
title = "Top Products",
subtitle = "Bananas, organic strawberries, baby spinach and avocados are among the most ordered products.",
x = "",
y = "Orders"
)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## Warning: Ignoring unknown parameters: stat

# Most Re-Ordered Products
most_re_ordered_products_tbl <- order_prod_prior %>%
group_by(product_id, reordered) %>%
filter(reordered == 1) %>%
summarize(count = n()) %>%
arrange(desc(count)) %>%
ungroup() %>%
top_n(10, wt = count)
## `summarise()` regrouping output by 'product_id' (override with `.groups` argument)
most_re_ordered_products_tbl %>%
left_join(select(products, product_id, product_name), by = "product_id") %>%
ggplot(aes(x = reorder(product_name, -count), y = count))+
geom_col(stat = "identity", fill = "#60ab59")+
scale_y_continuous(labels = scales::comma)+
labs(
title = "Top Re-Ordered Products",
subtitle = "Bananas, organic strawberries, baby spinach and avocados are among the most re-ordered products.",
x = "",
y = "Orders"
)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## Warning: Ignoring unknown parameters: stat

# Top Customers - Reward or Offer as thanks for being a top Instacart user, etc. Instacart has 1,374 users that have placed a minimum of 100 orders.
top_customers_tbl <- orders_update_tbl %>%
group_by(user_id) %>%
summarize(count = n()) %>%
arrange(desc(count)) %>%
ungroup() %>%
slice_max(count >= 100)
## `summarise()` ungrouping output (override with `.groups` argument)
# Examining Low Frequency - Are there any users who have placed 10 or less orders to date? Target with promotion/offer.
fewer_orders_customer_tbl <- orders_update_tbl %>%
group_by(user_id) %>%
summarize(count = n()) %>%
arrange(desc(count)) %>%
ungroup() %>%
slice_max(count <= 10)
## `summarise()` ungrouping output (override with `.groups` argument)
# What items should Instacart promote on Sunday - the DOW that the Instacart site sees the most traffic? We can update this to reflect any day of the week or hour of the day.
highlight_items_sunday <- orders_update_tbl %>%
filter(order_dow == "Sun") %>%
subset.data.frame(select = -c(3,6:7)) %>%
left_join(select(order_prod_prior, product_id, order_id), by = "order_id") %>%
left_join(select(products, product_id, product_name), by = "product_id") %>%
group_by(product_name) %>%
summarize(count = n()) %>%
arrange(desc(count)) %>%
ungroup() %>%
top_n(21, wt = count) %>%
drop_na()
## `summarise()` ungrouping output (override with `.groups` argument)
treemap(highlight_items_sunday,
index = "product_name",
vSize = "count",
title = "",
palette = "Set3",
type = "index")

# What items should Instacart promote at midnight -- Are Instacart users just as health-conscious with their late-night orders as they are throughout the rest of the day?
highlight_items_midnight <- orders_update_tbl %>%
filter(order_hour_of_day == "0") %>%
subset.data.frame(select = -c(3,6:7)) %>%
left_join(select(order_prod_prior, product_id, order_id), by = "order_id") %>%
left_join(select(products, product_id, product_name), by = "product_id") %>%
group_by(product_name) %>%
summarize(count = n()) %>%
arrange(desc(count)) %>%
ungroup() %>%
top_n(21, wt = count) %>%
drop_na()
## `summarise()` ungrouping output (override with `.groups` argument)
treemap(highlight_items_midnight,
index = "product_name",
vSize = "count",
title = "",
palette = "Set3",
type = "index")

# What items/deals should Instacart promote to Customer 1310 upon authenticating on-site or within the app or by way of a push notifications, etc.
promote_to_customer1310 <- orders_update_tbl %>%
filter(user_id == "1310") %>%
subset.data.frame(select = -c(3,6:7)) %>%
left_join(select(order_prod_prior, product_id, order_id), by = "order_id") %>%
left_join(select(products, product_id, product_name), by = "product_id") %>%
group_by(product_name) %>%
summarize(count = n()) %>%
arrange(desc(count)) %>%
ungroup() %>%
top_n(5, wt = count) %>%
drop_na()
## `summarise()` ungrouping output (override with `.groups` argument)
treemap(promote_to_customer1310,
index = "product_name",
vSize = "count",
title = "",
palette = "Set3",
type = "index")
