Instacart: Purchase Analysis

An exploration of the Instacart Dataset (2017), which contains a sample of more than 3 million Instacart orders placed by 200K+ Instacart users.

library(readxl)
library(dplyr)
library(ggplot2)
library(knitr)
library(stringr)
library(lubridate)
library(scales)
library(tidyverse)
library(tidyquant)
library(gghighlight)
library(treemap)

# Read In Instacart Data ----

orders <- read.csv("C:/Users/open/Documents/R PROJECTS/orders.csv")
products <- read.csv("C:/Users/open/Documents/R PROJECTS/products.csv")
order_prod_prior <- read.csv("C:/Users/open/Documents/R PROJECTS/order_products__prior.csv")
# Data Frames ----

# Orders - Orders (Updated) has 3.42M rows of data inclusive of Order ID, User ID, Order No, DOW, Time of Day and Days Since Prior Order. Note: DOW is an int, so I will interpret 0 as the Start of the Week, 1 Monday, etc. 

# First, I am going to change the numeric days to character DOW.There is no designation, but I will assume '0' is Sunday and so on.

orders_update_tbl <- orders %>% 
    mutate(order_dow = order_dow +1) %>% 
    mutate(order_dow  = wday(order_dow, label = TRUE))

kable(head(orders_update_tbl,20))
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order
2539329 1 prior 1 Tue 8 NA
2398795 1 prior 2 Wed 7 15
473747 1 prior 3 Wed 12 21
2254736 1 prior 4 Thu 7 29
431534 1 prior 5 Thu 15 28
3367565 1 prior 6 Tue 7 19
550135 1 prior 7 Mon 9 20
3108588 1 prior 8 Mon 14 14
2295261 1 prior 9 Mon 16 0
2550362 1 prior 10 Thu 8 30
1187899 1 train 11 Thu 8 14
2168274 2 prior 1 Tue 11 NA
1501582 2 prior 2 Fri 10 10
1901567 2 prior 3 Mon 10 3
738281 2 prior 4 Tue 10 8
1673511 2 prior 5 Wed 11 8
1199898 2 prior 6 Tue 9 13
3194192 2 prior 7 Tue 12 14
788338 2 prior 8 Mon 15 27
1718559 2 prior 9 Tue 9 8
glimpse(orders_update_tbl)
## Rows: 3,421,083
## Columns: 7
## $ order_id               <int> 2539329, 2398795, 473747, 2254736, 431534, 3...
## $ user_id                <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,...
## $ eval_set               <fct> prior, prior, prior, prior, prior, prior, pr...
## $ order_number           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1, 2, 3, ...
## $ order_dow              <ord> Tue, Wed, Wed, Thu, Thu, Tue, Mon, Mon, Mon,...
## $ order_hour_of_day      <int> 8, 7, 12, 7, 15, 7, 9, 14, 16, 8, 8, 11, 10,...
## $ days_since_prior_order <dbl> NA, 15, 21, 29, 28, 19, 20, 14, 0, 30, 14, N...
# Products - Products includes Product ID, Product Name ("All-Seasons Salt", "Chocolate Sandwich Cookies"), Aisle ID and Department ID.

kable(head(products,5))
product_id product_name aisle_id department_id
1 Chocolate Sandwich Cookies 61 19
2 All-Seasons Salt 104 13
3 Robust Golden Unsweetened Oolong Tea 94 7
4 Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce 38 1
5 Green Chile Anytime Sauce 5 13
glimpse(products)
## Rows: 49,688
## Columns: 4
## $ product_id    <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ product_name  <fct> Chocolate Sandwich Cookies, All-Seasons Salt, Robust ...
## $ aisle_id      <int> 61, 104, 94, 38, 5, 11, 98, 116, 120, 115, 31, 119, 1...
## $ department_id <int> 19, 13, 7, 1, 13, 11, 7, 1, 16, 7, 7, 1, 11, 17, 18, ...
# Order_Prod_Prior - Includes Order ID, Product ID, Add to Cart Order Total and then, if it was a reorder for the customer.

kable(head(order_prod_prior,5))
order_id product_id add_to_cart_order reordered
2 33120 1 1
2 28985 2 1
2 9327 3 0
2 45918 4 1
2 30035 5 0
glimpse(order_prod_prior)
## Rows: 32,434,489
## Columns: 4
## $ order_id          <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3...
## $ product_id        <int> 33120, 28985, 9327, 45918, 30035, 17794, 40141, 1...
## $ add_to_cart_order <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8...
## $ reordered         <int> 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1...
# Analysis ----

# Examining Order Total by Day of Week - Hex Code is Reflective of Instacart's Primary Brand Color

orders_update_tbl %>% 
    ggplot(aes(x = order_dow))+
    geom_bar(stat = "count", fill = "#60ab59") +
    scale_y_continuous(labels = scales::comma) +
    labs(
        title = "Instacart: Orders by Day of Week",
        subtitle = "Instacart sees the greatest number of orders on Sunday and Monday, whereas Tuesday - Saturday are relatively in-line with one another.",
        x = "",
        y = "Orders"
    )

# Examining Order Total by Hour - 1

orders_update_tbl %>% 
    ggplot(aes(x = order_hour_of_day))+
    geom_histogram(stat = "count", fill = "#60ab59") +
    scale_y_continuous(labels = scales::comma) +
    labs(
        title = "Instacart: Orders by Hour",
        subtitle = "The majority of Instacart orders are placed between 9am and 4pm. Peak hours appear to be 10am and 11am. Order traffic remains steady from 10am - 3pm.",
        x = "",
        y = "Orders"
    )
## Warning: Ignoring unknown parameters: binwidth, bins, pad

# Examining Order Total by Hour - 2

orders_update_tbl %>% 
    ggplot(aes(x = order_hour_of_day))+
    geom_line(stat = "count", color = "#60ab59", size=2) +
    scale_y_continuous(labels = scales::comma) +
    labs(
        title = "Instacart: Orders by Hour",
        subtitle = "The majority of Instacart orders are placed between 9am and 4pm. Peak hours appear to be 10am and 11am. Order volume remains steady from 10am - 3pm.",
        x = "",
        y = "Orders"
    )

# How many days between orders? 

orders_update_tbl %>% 
    ggplot(aes(x = days_since_prior_order))+
    geom_histogram(stat = "count", fill = "#60ab59") +
    scale_y_continuous(labels = scales::comma) +
    labs(
        title = "Instacart: Length of Time Between Orders",
        subtitle = "A sizeable portion of Instacart shoppers order again within one week of their initial order. However, the data shows that Instacart is seeing the greatest portion of re-orders placed 30 days after the intial order. ",
        x = "Days",
        y = "Orders"
    )
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 206209 rows containing non-finite values (stat_count).

# Count of Orders? 

orders_update_tbl %>% 
    filter(eval_set == "prior") %>% 
    count(order_number) %>% 
    ggplot(aes(x = order_number, n)) +
    geom_line(color = "#60ab59", size = 2) +
    scale_y_continuous(labels = scales::comma) +
    scale_x_continuous(breaks = seq(0, 100, 5),
                       limits = c(0, 100)) +
    labs(
        title = "Count of Orders",
        subtitle = "The vast majority of Instacart shoppers have placed at least 3-5 orders.",
        x = "Count: Prior Orders",
        y = ""
    )

# How many items are generally purchased with each Instacart order?

average_order_size_tbl <- order_prod_prior %>% 
    group_by(order_id) %>% 
    summarize(n_items = last(add_to_cart_order))
## `summarise()` ungrouping output (override with `.groups` argument)
average_order_size_tbl %>%
    ggplot(aes(x = n_items))+
    geom_histogram(stat = "count", fill = "#60ab59") +
    scale_y_continuous(labels = scales::comma) +
    scale_x_continuous(breaks = seq(0, 60, 5),
                       limits = c(0, 60)) +
    labs(
        title = "Instacart: Items Per Order",
        subtitle = "Instacart Shoppers generally purchase 5-6 items per order.",
        x = "Number of Items",
        y = ""
    )
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 813 rows containing non-finite values (stat_count).
## Warning: Removed 1 rows containing missing values (geom_bar).

# Examining Top Products - What items are ordered most often? Interestingly, bananas, organic bananas, organic strawberries, organic baby spinach and organic avocados are among the most ordered grocery items - Healthy!

most_ordered_tbl <- order_prod_prior %>% 
    group_by(product_id) %>%
    summarize(count = n()) %>% 
    top_n(20, wt = count) %>%
    left_join(select(products, product_id, product_name), by = "product_id") %>%
    arrange(desc(count))
## `summarise()` ungrouping output (override with `.groups` argument)
most_ordered_tbl %>% 
    ggplot(aes(x = reorder(product_name, -count), y = count))+
    geom_col(stat = "identity", fill = "#60ab59")+
    scale_y_continuous(labels = scales::comma)+
    labs(
        title = "Top Products",
        subtitle = "Bananas, organic strawberries, baby spinach and avocados are among the most ordered products.",
        x = "",
        y = "Orders"
    )+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## Warning: Ignoring unknown parameters: stat

# Most Re-Ordered Products

most_re_ordered_products_tbl <- order_prod_prior %>% 
    group_by(product_id, reordered) %>% 
    filter(reordered == 1) %>% 
    summarize(count = n()) %>% 
    arrange(desc(count)) %>% 
    ungroup() %>% 
    top_n(10, wt = count)
## `summarise()` regrouping output by 'product_id' (override with `.groups` argument)
most_re_ordered_products_tbl %>% 
    left_join(select(products, product_id, product_name), by = "product_id") %>% 
    ggplot(aes(x = reorder(product_name, -count), y = count))+
    geom_col(stat = "identity", fill = "#60ab59")+
    scale_y_continuous(labels = scales::comma)+
    labs(
        title = "Top Re-Ordered Products",
        subtitle = "Bananas, organic strawberries, baby spinach and avocados are among the most re-ordered products.",
        x = "",
        y = "Orders"
    )+
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## Warning: Ignoring unknown parameters: stat

# Top Customers - Reward or Offer as thanks for being a top Instacart user, etc. Instacart has 1,374 users that have placed a minimum of 100 orders.

top_customers_tbl <- orders_update_tbl %>% 
    group_by(user_id) %>% 
    summarize(count = n()) %>% 
    arrange(desc(count)) %>% 
    ungroup() %>% 
    slice_max(count >= 100)
## `summarise()` ungrouping output (override with `.groups` argument)
# Examining Low Frequency - Are there any users who have placed 10 or less orders to date? Target with promotion/offer.

fewer_orders_customer_tbl <- orders_update_tbl %>% 
    group_by(user_id) %>% 
    summarize(count = n()) %>% 
    arrange(desc(count)) %>% 
    ungroup() %>% 
    slice_max(count <= 10)
## `summarise()` ungrouping output (override with `.groups` argument)
# What items should Instacart promote on Sunday - the DOW that the Instacart site sees the most traffic? We can update this to reflect any day of the week or hour of the day.

highlight_items_sunday <- orders_update_tbl %>% 
    filter(order_dow == "Sun") %>% 
    subset.data.frame(select = -c(3,6:7)) %>% 
    left_join(select(order_prod_prior, product_id, order_id), by = "order_id") %>%
    left_join(select(products, product_id, product_name), by = "product_id") %>% 
    group_by(product_name) %>%
    summarize(count = n()) %>% 
    arrange(desc(count)) %>% 
    ungroup() %>%
    top_n(21, wt = count) %>% 
    drop_na()
## `summarise()` ungrouping output (override with `.groups` argument)
treemap(highlight_items_sunday,
        index = "product_name",
        vSize = "count",
        title = "",
        palette = "Set3",
        type = "index")

# What items should Instacart promote at midnight -- Are Instacart users just as health-conscious with their late-night orders as they are throughout the rest of the day?

highlight_items_midnight <- orders_update_tbl %>% 
    filter(order_hour_of_day == "0") %>% 
    subset.data.frame(select = -c(3,6:7)) %>% 
    left_join(select(order_prod_prior, product_id, order_id), by = "order_id") %>%
    left_join(select(products, product_id, product_name), by = "product_id") %>% 
    group_by(product_name) %>%
    summarize(count = n()) %>% 
    arrange(desc(count)) %>% 
    ungroup() %>%
    top_n(21, wt = count) %>% 
    drop_na()
## `summarise()` ungrouping output (override with `.groups` argument)
treemap(highlight_items_midnight,
        index = "product_name",
        vSize = "count",
        title = "",
        palette = "Set3",
        type = "index")

# What items/deals should Instacart promote to Customer 1310 upon authenticating on-site or within the app or by way of a push notifications, etc. 

promote_to_customer1310 <- orders_update_tbl %>% 
    filter(user_id == "1310") %>% 
    subset.data.frame(select = -c(3,6:7)) %>% 
    left_join(select(order_prod_prior, product_id, order_id), by = "order_id") %>%
    left_join(select(products, product_id, product_name), by = "product_id") %>% 
    group_by(product_name) %>%
    summarize(count = n()) %>% 
    arrange(desc(count)) %>% 
    ungroup() %>%
    top_n(5, wt = count) %>% 
    drop_na()
## `summarise()` ungrouping output (override with `.groups` argument)
treemap(promote_to_customer1310,
        index = "product_name",
        vSize = "count",
        title = "",
        palette = "Set3",
        type = "index")