Raw data available at https://www.kaggle.com/c/instacart-market-basket-analysis/data

Competition Description

 

The dataset for this competition is a relational set of files describing customers’ orders over time. The goal of the competition is to predict which products will be in a user’s next order. The dataset is anonymized and contains a sample of over 3 million grocery orders from more than 200,000 Instacart users. For each user, we provide between 4 and 100 of their orders, with the sequence of products purchased in each order. We also provide the week and hour of day the order was placed, and a relative measure of time between orders.

 

Required packages were loaded.
library(tidyverse)
library(stringr)
library(dtplyr)
library(knitr)
library(data.table)
Datasets were imported from working directory.
aisles <- fread("aisles.csv", showProgress = FALSE)
departments <- fread("departments.csv", showProgress = FALSE)
order_products_prior <- fread("order_products__prior.csv", showProgress = FALSE)
order_products_train <- fread("order_products__train.csv", showProgress = FALSE)
orders <- fread("orders.csv", showProgress = FALSE)
products <- fread("products.csv", showProgress = FALSE)

Structure of Datasets

products

kable(head(products))
product_id product_name aisle_id department_id
1 Chocolate Sandwich Cookies 61 19
2 All-Seasons Salt 104 13
3 Robust Golden Unsweetened Oolong Tea 94 7
4 Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce 38 1
5 Green Chile Anytime Sauce 5 13
6 Dry Nose Oil 11 11
summary(products)
##    product_id    product_name          aisle_id      department_id  
##  Min.   :    1   Length:49688       Min.   :  1.00   Min.   : 1.00  
##  1st Qu.:12423   Class :character   1st Qu.: 35.00   1st Qu.: 7.00  
##  Median :24844   Mode  :character   Median : 69.00   Median :13.00  
##  Mean   :24844                      Mean   : 67.77   Mean   :11.73  
##  3rd Qu.:37266                      3rd Qu.:100.00   3rd Qu.:17.00  
##  Max.   :49688                      Max.   :134.00   Max.   :21.00

orders

kable(head(orders))
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order
2539329 1 prior 1 2 8 NA
2398795 1 prior 2 3 7 15
473747 1 prior 3 3 12 21
2254736 1 prior 4 4 7 29
431534 1 prior 5 4 15 28
3367565 1 prior 6 2 7 19
summary(orders)
##     order_id          user_id         eval_set          order_number   
##  Min.   :      1   Min.   :     1   Length:3421083     Min.   :  1.00  
##  1st Qu.: 855272   1st Qu.: 51394   Class :character   1st Qu.:  5.00  
##  Median :1710542   Median :102689   Mode  :character   Median : 11.00  
##  Mean   :1710542   Mean   :102978                      Mean   : 17.15  
##  3rd Qu.:2565812   3rd Qu.:154385                      3rd Qu.: 23.00  
##  Max.   :3421083   Max.   :206209                      Max.   :100.00  
##                                                                        
##    order_dow     order_hour_of_day days_since_prior_order
##  Min.   :0.000   Min.   : 0.00     Min.   : 0.00         
##  1st Qu.:1.000   1st Qu.:10.00     1st Qu.: 4.00         
##  Median :3.000   Median :13.00     Median : 7.00         
##  Mean   :2.776   Mean   :13.45     Mean   :11.11         
##  3rd Qu.:5.000   3rd Qu.:16.00     3rd Qu.:15.00         
##  Max.   :6.000   Max.   :23.00     Max.   :30.00         
##                                    NA's   :206209

aisles

kable(head(aisles))
aisle_id aisle
1 prepared soups salads
2 specialty cheeses
3 energy granola bars
4 instant foods
5 marinades meat preparation
6 other
summary(aisles)
##     aisle_id         aisle          
##  Min.   :  1.00   Length:134        
##  1st Qu.: 34.25   Class :character  
##  Median : 67.50   Mode  :character  
##  Mean   : 67.50                     
##  3rd Qu.:100.75                     
##  Max.   :134.00

departments

kable(head(departments))
department_id department
1 frozen
2 other
3 bakery
4 produce
5 alcohol
6 international
summary(departments)
##  department_id  department       
##  Min.   : 1    Length:21         
##  1st Qu.: 6    Class :character  
##  Median :11    Mode  :character  
##  Mean   :11                      
##  3rd Qu.:16                      
##  Max.   :21

order_products_prior

kable(head(order_products_prior))
order_id product_id add_to_cart_order reordered
2 33120 1 1
2 28985 2 1
2 9327 3 0
2 45918 4 1
2 30035 5 0
2 17794 6 1
summary(order_products_prior)
##     order_id         product_id    add_to_cart_order   reordered     
##  Min.   :      2   Min.   :    1   Min.   :  1.000   Min.   :0.0000  
##  1st Qu.: 855943   1st Qu.:13530   1st Qu.:  3.000   1st Qu.:0.0000  
##  Median :1711048   Median :25256   Median :  6.000   Median :1.0000  
##  Mean   :1710749   Mean   :25576   Mean   :  8.351   Mean   :0.5897  
##  3rd Qu.:2565514   3rd Qu.:37935   3rd Qu.: 11.000   3rd Qu.:1.0000  
##  Max.   :3421083   Max.   :49688   Max.   :145.000   Max.   :1.0000

order_products_train

kable(head(order_products_train))
order_id product_id add_to_cart_order reordered
1 49302 1 1
1 11109 2 1
1 10246 3 0
1 49683 4 0
1 43633 5 1
1 13176 6 0
summary(order_products_train)
##     order_id         product_id    add_to_cart_order   reordered     
##  Min.   :      1   Min.   :    1   Min.   : 1.000    Min.   :0.0000  
##  1st Qu.: 843370   1st Qu.:13380   1st Qu.: 3.000    1st Qu.:0.0000  
##  Median :1701880   Median :25298   Median : 7.000    Median :1.0000  
##  Mean   :1706298   Mean   :25556   Mean   : 8.758    Mean   :0.5986  
##  3rd Qu.:2568023   3rd Qu.:37940   3rd Qu.:12.000    3rd Qu.:1.0000  
##  Max.   :3421070   Max.   :49688   Max.   :80.000    Max.   :1.0000

 

 

Merge Datasets & Sample

 

The datasets are relational, so they were combined into a single table.
all_orders <- order_products_prior %>% 
    left_join(orders, by = "order_id") %>% 
    left_join(products, by = "product_id") %>% 
    left_join(departments, by = "department_id") %>% 
    left_join(aisles, by = "aisle_id")
A random sample of 1 000 000 rows was taken to reduce computation time.
sample_orders <- all_orders[sample(nrow(all_orders), size = 1000000),]

 

Understanding the Data

 

The variable order_number could either refer to the sequence with which a specific product was purchased by a customer, or it could refer to the sequence with which orders were placed by the customer, including all products. To determine which interpretation was correct, user_id = 1 was selected to inspect. Table 1 shows that this user had two orders, at the same time, with the same order_id, but different product_ids. Since order_id = 10 for both observations, order_number must refer to the sequence of orders placed by the customer, and not to the sequence of a specific product being purchased.

sample_orders %>% 
    group_by(user_id) %>% 
    filter(user_id == 1) %>% 
    select(user_id, order_id, product_id, order_number) %>% 
    kable(caption = "Table 1. For user_id = 1, a single order containing two different products where both order_numbers are the same.")
Table 1. For user_id = 1, a single order containing two different products where both order_numbers are the same.
user_id order_id product_id order_number
1 2550362 35951 10
1 2550362 196 10

 

days_since_prior_order is a measure of how many days it has been since the previous order. It is ambiguous whether this refers to:

  1. Days since prior order of this product, or
  2. Days since prior order by this customer.

Table 2 shows that user_id = 191099 had an order with four products, three of which were not reorders. Since days_since_prior_order = 12 for all products purchased by this customer, and three were not reordered, days_since_prior_order must measure the number of days since a customers last order, not days since their last order of a specific product.

sample_orders %>% 
    group_by(order_id) %>% 
    filter(n() > 3 & !is.na(days_since_prior_order)) %>% 
    head(4) %>% 
    select(user_id, order_id, product_id, reordered, days_since_prior_order) %>% 
    kable(caption = "Table 2. For a single order, all values of days_since_prior_order are equal.")
Table 2. For a single order, all values of days_since_prior_order are equal.
user_id order_id product_id reordered days_since_prior_order
191099 1035240 31486 0 12
191099 1035240 37262 0 12
191099 1035240 43643 1 12
191099 1035240 41844 0 12

 

Exploratory Analysis

 

The variable order_dow (day of week), was recoded as week days.
sample_orders$order_dow <- as.factor(sample_orders$order_dow)
sample_orders$order_dow <- factor(sample_orders$order_dow, 
                                  labels = c("Sunday", "Monday", "Tuesday", 
                                             "Wednesday", "Thursday", "Friday", "Saturday"))

 

Are there more orders placed on some days than others?

 

The highest number of orders were placed on Saturday, Sunday, and Monday, with Thursday having the fewest orders (Fig. 1).

ggplot(sample_orders, aes(order_dow)) +
    geom_bar(fill = "dodgerblue 2", col = "black") +
    labs(x = NULL, y = "Orders (thousands)", caption = "Figure 1. Total orders placed by Instacart customers on each day.") +
    theme_classic() +
    theme(plot.title = element_text(hjust = 0.5, size = 20), 
          axis.text.x = element_text(size = 12, angle = 45, vjust = 0.6),
          axis.text.y = element_text(size = 12),
          axis.title.y = element_text(size = 15),
          plot.caption = element_text(size = 12, hjust = 0)) +
    scale_y_continuous(labels = function(x) x / 1000)

 

Is there an association between the proportion of products reordered and the day of week in which the order was placed?

 

There is no clear association between the day of the week and the proportion of products which were reordered (Fig. 2).

ggplot(sample_orders, aes(order_dow, fill = factor(reordered, labels = c("Not Reordered", "Reordered")))) +
    geom_bar(position = "fill", col = "black") +
    labs(x = NULL, y = "Proportion of Orders", fill = "", caption = "Figure 2. Proportion of reorders by Instacart customers on each day.") +
    theme_classic() +
    scale_fill_manual(values = c("dodgerblue 2", "darkred")) +
    theme(plot.title = element_text(hjust = 0.5, size = 20), 
          axis.text.x =  element_text(size = 12, angle = 45, vjust = 0.6),
          axis.text.y = element_text(size = 12),
          axis.title.y = element_text(size = 15),
          plot.caption = element_text(size = 12, hjust = 0),
          legend.text = element_text(size = 12),
          legend.position = "right")

 

Does order volume change throughout the day?

 

Order counts increase sharply from early morning to 10am, are generally stable until 3pm, then decrease steadily for the rest of the day (Fig. 3).

ggplot(sample_orders, aes(order_hour_of_day)) +
    geom_bar(fill = "dodgerblue 2", col = "black") +
    labs(x = "", y = "Orders (thousands)", caption = "Figure 3. Total orders placed by Instacart customers during each hour of the day.") +
    theme_classic() +
    theme(axis.text.x = element_text(size = 15),
          axis.text.y = element_text(size = 12),
          axis.title.y = element_text(size = 15),
          plot.caption = element_text(size = 12, hjust = 0)) +
    scale_x_continuous(breaks = c(0, 6, 12, 18), labels = c("12am","6am","12pm","6pm")) +
    scale_y_continuous(labels = function(x) x / 1000)

 

Does the proportion of reorders change throughout the day?

 

There does not appear to be an association between the proportion of reorders and time of day (Fig. 4).

sample_orders %>% 
    ggplot(aes(order_hour_of_day, fill = factor(reordered, labels = c("Not Reordered", "Reordered")))) +
        geom_bar(position = "fill", col = "black") +
        labs(x = "", y = "Proportion of Orders", fill = "", caption = "Figure 4. Proportion of reorders by Instacart customers during each hour of the day.") +
        theme_classic() +
        scale_fill_manual(values = c("dodgerblue 2", "darkred")) +
        theme(axis.title.x = element_text(size = 18),
              axis.title.y = element_text(size = 15),
              axis.text.x = element_text(size = 15),
              axis.text.y = element_text(size = 12),
          plot.caption = element_text(size = 12, hjust = 0),
          legend.text = element_text(size = 12),
          legend.position = "right") +
          scale_x_continuous(breaks = c(0, 6, 12, 18), labels = c("12am","6am","12pm","6pm"))

 

Does this relationship change if only observations where days_since_prior_order = 0 are included?

 

A greater peak in reorders is present at 6am, with a downward trend continuing until midnight (Fig. 5.).

sample_orders %>% 
    filter(days_since_prior_order == 0) %>% 
    ggplot(aes(order_hour_of_day, fill = factor(reordered, labels = c("Not Reordered", "Reordered")))) +
        geom_bar(position = "fill", col = "black") +
        labs(x = "", y = "Proportion of Orders", fill = "", caption = "Figure 5. Proportion of reorders by Instacart customers during each hour of day, 
               containing only observations where days_since_prior_order = 0.") +
        theme_classic() +
        scale_fill_manual(values = c("dodgerblue 2", "darkred")) +
        theme(axis.title.x = element_text(size = 18),
              axis.title.y = element_text(size = 15),
              axis.text.x = element_text(size = 15),
              axis.text.y = element_text(size = 12),
          plot.caption = element_text(size = 12, hjust = 0),
          legend.text = element_text(size = 12),
          legend.position = "right") +
          scale_x_continuous(breaks = c(0, 6, 12, 18), labels = c("12am","6am","12pm","6pm"))

 

Do customers place orders at regular intervals of time, such as every week or every month?

 

This indeed is the case. There are large spikes in reordering at day 7 and day 30 (Fig. 6). The variable days_since_prior_order has a range of 30, so any orders spread apart by greater than 30 days may have been recorded as 30. This would explain the sudden increase in order counts at day 30.

sample_orders %>% 
    filter(!is.na(days_since_prior_order)) %>% 
    ggplot(aes(days_since_prior_order)) +
        geom_bar(fill = "dodgerblue 2", col = "black") +
        labs(x = "Days Since Prior Order", y = "Orders (thousands)", caption = "
             Figure 6. Total orders by Instacart customers for each value of days_since_prior_order.") +
        theme_classic() +
        theme(axis.text.x = element_text(size = 15),
            axis.text.y = element_text(size = 12),
            axis.title.y = element_text(size = 15),
            axis.title.x = element_text(size = 15),
            plot.caption = element_text(size = 12)) +
        scale_y_continuous(breaks = c(seq(0, 100000, 20000)), labels = function(x) x / 1000)

 

Is their an association between the number of days since prior order and the proportion of reorders?

 

The proportion of reorders is greatest when days_since_prior_order = 0, with a downward linear trend thereafter (Fig. 7).

sample_orders %>% 
    filter(!is.na(days_since_prior_order)) %>% 
    ggplot(aes(days_since_prior_order, fill = factor(reordered, labels = c("Not Reordered", "Reordered")))) +
        geom_bar(position = "fill", col = "black") +
        labs(x = "Days Since Prior Order", y = "Proportion of Orders", fill = "", caption = "
             Figure 7. Proportion of reorders for each value of days_since_prior_order.") +
        theme_classic() +
        scale_fill_manual(values = c("dodgerblue 2", "darkred")) +
        theme(axis.title.x = element_text(size = 15),
              axis.title.y = element_text(size = 15),
              axis.text.x = element_text(size = 15),
              axis.text.y = element_text(size = 12),
          plot.caption = element_text(size = 12, hjust = 0.3),
          legend.text = element_text(size = 12))

 

Which products had the highest volume of reorders?

 

Fruits and vegetables, especially those which are organic, had the greatest total number of reorders (Fig. 8).

sample_orders %>% 
    group_by(product_name) %>% 
    filter(reordered == 1) %>% 
    summarize(count = n()) %>% 
    arrange(desc(count)) %>% 
    head(10) %>% 
    ggplot(aes(factor(reorder(product_name, count)), count)) +
        geom_bar(stat = "identity", fill = "darkred", col = "black") +
        theme_classic() +
        labs(x = NULL, y = "Reorders (thousands)", caption = "
             Figure 8. Top 10 products which had the greatest volume of reorders.") +
        theme(axis.text.x = element_text(size = 15),
            axis.text.y = element_text(size = 12),
            axis.title.y = element_text(size = 15),
            axis.title.x = element_text(size = 15),
            plot.caption = element_text(size = 12)) +
        coord_flip() +
        scale_y_continuous(breaks = (seq(0, 12000, 2000)), labels = function(x) x / 1000)

 

Which products had the greatest proportion of reorders?

 

As can be seen with Table 1, many of the products with the greatest proportion of reorders are dairy products.

sample_orders %>% 
    group_by(product_name) %>% 
    summarize(prop_reordered = mean(reordered), count = n()) %>% 
    filter(count > 50) %>% 
    arrange(desc(prop_reordered)) %>% 
    select(product_name, prop_reordered) %>% 
    head(10) %>% 
    kable(caption = "Table 3. Top 10 products with the greatest proportion of reorders.")
Table 3. Top 10 products with the greatest proportion of reorders.
product_name prop_reordered
Milk, Organic, Fat Free 0.8904110
Organic Yogurt Original Plain 0.8852459
Goat Milk 0.8840580
Seltzer Water 0.8823529
Chocolate Crepes 0.8800000
Petit Suisse Fruit 0.8728814
Whole Organic Omega 3 Milk 0.8645833
Organic Lactose Free Whole Milk 0.8588235
Milk, Organic, Vitamin D 0.8539683
Organic Reduced Fat 2% Cottage Cheese 0.8507463

 

Is there an association between organic food and proportion reordered?

 

Organic products were reordered at a greater proportion compared to non-organic products (Fig. 9).

sample_orders %>% 
    mutate(is_organic = str_detect(product_name, "^Organic")) %>% 
    ggplot(aes(x = factor(is_organic, labels = c("Not Organic", "Organic")), 
               fill = factor(reordered, labels = c("Not Reordered", "Reordered")))) +
        geom_bar(position = "fill", col = "black") +
        labs(x = NULL, y = "Proportion of Orders", fill = "", caption = "
             Figure 9. Proportion of reorders for organic, and non-organic products.") +
        theme_classic() +
        scale_fill_manual(values = c("dodgerblue 2", "darkred")) +
        theme(axis.title.x = element_text(size = 18),
              axis.title.y = element_text(size = 15),
              axis.text.x = element_text(size = 15),
              axis.text.y = element_text(size = 12),
          plot.caption = element_text(size = 12, hjust = 0.8),
          legend.text = element_text(size = 12))

 

Do some aisles or departments have more reorderes than others?

 

The produce department had the most reorders, followed by dairy and eggs (Fig. 10).

sample_orders %>% 
    filter(reordered == 1) %>% 
    group_by(department, aisle) %>% 
    count() %>% 
    ungroup() %>%
    arrange(desc(n)) %>% 
    head(10) %>% 
    ggplot(aes(reorder(aisle, n), y = n, 
               fill = reorder(department, desc(n)))) +
        geom_bar(stat = "identity", col = "black") +
        labs(x = NULL, y = "Reorders (thousands)", fill = "Department
", caption = "
             Figure 10. Top 10 aisles which had the greastest volume of reorders, 
coloured by department.") +
        theme_classic() +
        theme(axis.title.x = element_text(size = 15),
              axis.text.x = element_text(size = 15),
              axis.text.y = element_text(size = 12),
          plot.caption = element_text(size = 12, hjust = 0.5),
          legend.text = element_text(size = 12),
          legend.title = element_text(size = 15)) +
        scale_y_continuous(breaks = (seq(0, 80000, 20000)), labels = function(x) x / 1000) +
        coord_flip() 

 

Do the same aisles also have the greatest proportion of reorders?

 

The milk aisle in the dairy and eggs department had the greatest proportion of reorders (Table 4). From the produce department, only fresh fruits were in the top 10 aisles which had the greatest proportion of reorders.

sample_orders %>% 
    group_by(product_name) %>% 
    mutate(prop_reorder = mean(reordered), count = n()) %>% 
    filter(count > 50) %>% 
    ungroup() %>% 
    group_by(department, aisle) %>% 
    summarize(prop_mean = mean(prop_reorder)) %>% 
    ungroup() %>% 
    arrange(desc(prop_mean)) %>% 
    head(10) %>% 
    kable(caption = "Table 4. Top 10 aisles with the greatest proportion of reorders.")
Table 4. Top 10 aisles with the greatest proportion of reorders.
department aisle prop_mean
dairy eggs milk 0.7914534
other other 0.7692308
alcohol spirits 0.7386364
beverages water seltzer sparkling water 0.7382553
produce fresh fruits 0.7251602
personal care digestion 0.7238095
pets cat food care 0.7129630
dairy eggs eggs 0.7081332
dairy eggs soy lactosefree 0.7043425
dairy eggs yogurt 0.7021933