Raw data available at https://www.kaggle.com/c/instacart-market-basket-analysis/data
The dataset for this competition is a relational set of files describing customers’ orders over time. The goal of the competition is to predict which products will be in a user’s next order. The dataset is anonymized and contains a sample of over 3 million grocery orders from more than 200,000 Instacart users. For each user, we provide between 4 and 100 of their orders, with the sequence of products purchased in each order. We also provide the week and hour of day the order was placed, and a relative measure of time between orders.
library(tidyverse)
library(stringr)
library(dtplyr)
library(knitr)
library(data.table)
aisles <- fread("aisles.csv", showProgress = FALSE)
departments <- fread("departments.csv", showProgress = FALSE)
order_products_prior <- fread("order_products__prior.csv", showProgress = FALSE)
order_products_train <- fread("order_products__train.csv", showProgress = FALSE)
orders <- fread("orders.csv", showProgress = FALSE)
products <- fread("products.csv", showProgress = FALSE)
kable(head(products))
product_id | product_name | aisle_id | department_id |
---|---|---|---|
1 | Chocolate Sandwich Cookies | 61 | 19 |
2 | All-Seasons Salt | 104 | 13 |
3 | Robust Golden Unsweetened Oolong Tea | 94 | 7 |
4 | Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce | 38 | 1 |
5 | Green Chile Anytime Sauce | 5 | 13 |
6 | Dry Nose Oil | 11 | 11 |
summary(products)
## product_id product_name aisle_id department_id
## Min. : 1 Length:49688 Min. : 1.00 Min. : 1.00
## 1st Qu.:12423 Class :character 1st Qu.: 35.00 1st Qu.: 7.00
## Median :24844 Mode :character Median : 69.00 Median :13.00
## Mean :24844 Mean : 67.77 Mean :11.73
## 3rd Qu.:37266 3rd Qu.:100.00 3rd Qu.:17.00
## Max. :49688 Max. :134.00 Max. :21.00
kable(head(orders))
order_id | user_id | eval_set | order_number | order_dow | order_hour_of_day | days_since_prior_order |
---|---|---|---|---|---|---|
2539329 | 1 | prior | 1 | 2 | 8 | NA |
2398795 | 1 | prior | 2 | 3 | 7 | 15 |
473747 | 1 | prior | 3 | 3 | 12 | 21 |
2254736 | 1 | prior | 4 | 4 | 7 | 29 |
431534 | 1 | prior | 5 | 4 | 15 | 28 |
3367565 | 1 | prior | 6 | 2 | 7 | 19 |
summary(orders)
## order_id user_id eval_set order_number
## Min. : 1 Min. : 1 Length:3421083 Min. : 1.00
## 1st Qu.: 855272 1st Qu.: 51394 Class :character 1st Qu.: 5.00
## Median :1710542 Median :102689 Mode :character Median : 11.00
## Mean :1710542 Mean :102978 Mean : 17.15
## 3rd Qu.:2565812 3rd Qu.:154385 3rd Qu.: 23.00
## Max. :3421083 Max. :206209 Max. :100.00
##
## order_dow order_hour_of_day days_since_prior_order
## Min. :0.000 Min. : 0.00 Min. : 0.00
## 1st Qu.:1.000 1st Qu.:10.00 1st Qu.: 4.00
## Median :3.000 Median :13.00 Median : 7.00
## Mean :2.776 Mean :13.45 Mean :11.11
## 3rd Qu.:5.000 3rd Qu.:16.00 3rd Qu.:15.00
## Max. :6.000 Max. :23.00 Max. :30.00
## NA's :206209
kable(head(aisles))
aisle_id | aisle |
---|---|
1 | prepared soups salads |
2 | specialty cheeses |
3 | energy granola bars |
4 | instant foods |
5 | marinades meat preparation |
6 | other |
summary(aisles)
## aisle_id aisle
## Min. : 1.00 Length:134
## 1st Qu.: 34.25 Class :character
## Median : 67.50 Mode :character
## Mean : 67.50
## 3rd Qu.:100.75
## Max. :134.00
kable(head(departments))
department_id | department |
---|---|
1 | frozen |
2 | other |
3 | bakery |
4 | produce |
5 | alcohol |
6 | international |
summary(departments)
## department_id department
## Min. : 1 Length:21
## 1st Qu.: 6 Class :character
## Median :11 Mode :character
## Mean :11
## 3rd Qu.:16
## Max. :21
kable(head(order_products_prior))
order_id | product_id | add_to_cart_order | reordered |
---|---|---|---|
2 | 33120 | 1 | 1 |
2 | 28985 | 2 | 1 |
2 | 9327 | 3 | 0 |
2 | 45918 | 4 | 1 |
2 | 30035 | 5 | 0 |
2 | 17794 | 6 | 1 |
summary(order_products_prior)
## order_id product_id add_to_cart_order reordered
## Min. : 2 Min. : 1 Min. : 1.000 Min. :0.0000
## 1st Qu.: 855943 1st Qu.:13530 1st Qu.: 3.000 1st Qu.:0.0000
## Median :1711048 Median :25256 Median : 6.000 Median :1.0000
## Mean :1710749 Mean :25576 Mean : 8.351 Mean :0.5897
## 3rd Qu.:2565514 3rd Qu.:37935 3rd Qu.: 11.000 3rd Qu.:1.0000
## Max. :3421083 Max. :49688 Max. :145.000 Max. :1.0000
kable(head(order_products_train))
order_id | product_id | add_to_cart_order | reordered |
---|---|---|---|
1 | 49302 | 1 | 1 |
1 | 11109 | 2 | 1 |
1 | 10246 | 3 | 0 |
1 | 49683 | 4 | 0 |
1 | 43633 | 5 | 1 |
1 | 13176 | 6 | 0 |
summary(order_products_train)
## order_id product_id add_to_cart_order reordered
## Min. : 1 Min. : 1 Min. : 1.000 Min. :0.0000
## 1st Qu.: 843370 1st Qu.:13380 1st Qu.: 3.000 1st Qu.:0.0000
## Median :1701880 Median :25298 Median : 7.000 Median :1.0000
## Mean :1706298 Mean :25556 Mean : 8.758 Mean :0.5986
## 3rd Qu.:2568023 3rd Qu.:37940 3rd Qu.:12.000 3rd Qu.:1.0000
## Max. :3421070 Max. :49688 Max. :80.000 Max. :1.0000
all_orders <- order_products_prior %>%
left_join(orders, by = "order_id") %>%
left_join(products, by = "product_id") %>%
left_join(departments, by = "department_id") %>%
left_join(aisles, by = "aisle_id")
sample_orders <- all_orders[sample(nrow(all_orders), size = 1000000),]
The variable order_number
could either refer to the sequence with which a specific product was purchased by a customer, or it could refer to the sequence with which orders were placed by the customer, including all products. To determine which interpretation was correct, user_id = 1 was selected to inspect. Table 1 shows that this user had two orders, at the same time, with the same order_id, but different product_ids. Since order_id = 10 for both observations, order_number
must refer to the sequence of orders placed by the customer, and not to the sequence of a specific product being purchased.
sample_orders %>%
group_by(user_id) %>%
filter(user_id == 1) %>%
select(user_id, order_id, product_id, order_number) %>%
kable(caption = "Table 1. For user_id = 1, a single order containing two different products where both order_numbers are the same.")
user_id | order_id | product_id | order_number |
---|---|---|---|
1 | 2550362 | 35951 | 10 |
1 | 2550362 | 196 | 10 |
days_since_prior_order
is a measure of how many days it has been since the previous order. It is ambiguous whether this refers to:
Table 2 shows that user_id = 191099 had an order with four products, three of which were not reorders. Since days_since_prior_order = 12 for all products purchased by this customer, and three were not reordered, days_since_prior_order
must measure the number of days since a customers last order, not days since their last order of a specific product.
sample_orders %>%
group_by(order_id) %>%
filter(n() > 3 & !is.na(days_since_prior_order)) %>%
head(4) %>%
select(user_id, order_id, product_id, reordered, days_since_prior_order) %>%
kable(caption = "Table 2. For a single order, all values of days_since_prior_order are equal.")
user_id | order_id | product_id | reordered | days_since_prior_order |
---|---|---|---|---|
191099 | 1035240 | 31486 | 0 | 12 |
191099 | 1035240 | 37262 | 0 | 12 |
191099 | 1035240 | 43643 | 1 | 12 |
191099 | 1035240 | 41844 | 0 | 12 |
order_dow
(day of week), was recoded as week days.sample_orders$order_dow <- as.factor(sample_orders$order_dow)
sample_orders$order_dow <- factor(sample_orders$order_dow,
labels = c("Sunday", "Monday", "Tuesday",
"Wednesday", "Thursday", "Friday", "Saturday"))
The highest number of orders were placed on Saturday, Sunday, and Monday, with Thursday having the fewest orders (Fig. 1).
ggplot(sample_orders, aes(order_dow)) +
geom_bar(fill = "dodgerblue 2", col = "black") +
labs(x = NULL, y = "Orders (thousands)", caption = "Figure 1. Total orders placed by Instacart customers on each day.") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5, size = 20),
axis.text.x = element_text(size = 12, angle = 45, vjust = 0.6),
axis.text.y = element_text(size = 12),
axis.title.y = element_text(size = 15),
plot.caption = element_text(size = 12, hjust = 0)) +
scale_y_continuous(labels = function(x) x / 1000)
There is no clear association between the day of the week and the proportion of products which were reordered (Fig. 2).
ggplot(sample_orders, aes(order_dow, fill = factor(reordered, labels = c("Not Reordered", "Reordered")))) +
geom_bar(position = "fill", col = "black") +
labs(x = NULL, y = "Proportion of Orders", fill = "", caption = "Figure 2. Proportion of reorders by Instacart customers on each day.") +
theme_classic() +
scale_fill_manual(values = c("dodgerblue 2", "darkred")) +
theme(plot.title = element_text(hjust = 0.5, size = 20),
axis.text.x = element_text(size = 12, angle = 45, vjust = 0.6),
axis.text.y = element_text(size = 12),
axis.title.y = element_text(size = 15),
plot.caption = element_text(size = 12, hjust = 0),
legend.text = element_text(size = 12),
legend.position = "right")
Order counts increase sharply from early morning to 10am, are generally stable until 3pm, then decrease steadily for the rest of the day (Fig. 3).
ggplot(sample_orders, aes(order_hour_of_day)) +
geom_bar(fill = "dodgerblue 2", col = "black") +
labs(x = "", y = "Orders (thousands)", caption = "Figure 3. Total orders placed by Instacart customers during each hour of the day.") +
theme_classic() +
theme(axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 12),
axis.title.y = element_text(size = 15),
plot.caption = element_text(size = 12, hjust = 0)) +
scale_x_continuous(breaks = c(0, 6, 12, 18), labels = c("12am","6am","12pm","6pm")) +
scale_y_continuous(labels = function(x) x / 1000)
There does not appear to be an association between the proportion of reorders and time of day (Fig. 4).
sample_orders %>%
ggplot(aes(order_hour_of_day, fill = factor(reordered, labels = c("Not Reordered", "Reordered")))) +
geom_bar(position = "fill", col = "black") +
labs(x = "", y = "Proportion of Orders", fill = "", caption = "Figure 4. Proportion of reorders by Instacart customers during each hour of the day.") +
theme_classic() +
scale_fill_manual(values = c("dodgerblue 2", "darkred")) +
theme(axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 15),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 12),
plot.caption = element_text(size = 12, hjust = 0),
legend.text = element_text(size = 12),
legend.position = "right") +
scale_x_continuous(breaks = c(0, 6, 12, 18), labels = c("12am","6am","12pm","6pm"))
A greater peak in reorders is present at 6am, with a downward trend continuing until midnight (Fig. 5.).
sample_orders %>%
filter(days_since_prior_order == 0) %>%
ggplot(aes(order_hour_of_day, fill = factor(reordered, labels = c("Not Reordered", "Reordered")))) +
geom_bar(position = "fill", col = "black") +
labs(x = "", y = "Proportion of Orders", fill = "", caption = "Figure 5. Proportion of reorders by Instacart customers during each hour of day,
containing only observations where days_since_prior_order = 0.") +
theme_classic() +
scale_fill_manual(values = c("dodgerblue 2", "darkred")) +
theme(axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 15),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 12),
plot.caption = element_text(size = 12, hjust = 0),
legend.text = element_text(size = 12),
legend.position = "right") +
scale_x_continuous(breaks = c(0, 6, 12, 18), labels = c("12am","6am","12pm","6pm"))
This indeed is the case. There are large spikes in reordering at day 7 and day 30 (Fig. 6). The variable days_since_prior_order
has a range of 30, so any orders spread apart by greater than 30 days may have been recorded as 30. This would explain the sudden increase in order counts at day 30.
sample_orders %>%
filter(!is.na(days_since_prior_order)) %>%
ggplot(aes(days_since_prior_order)) +
geom_bar(fill = "dodgerblue 2", col = "black") +
labs(x = "Days Since Prior Order", y = "Orders (thousands)", caption = "
Figure 6. Total orders by Instacart customers for each value of days_since_prior_order.") +
theme_classic() +
theme(axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 12),
axis.title.y = element_text(size = 15),
axis.title.x = element_text(size = 15),
plot.caption = element_text(size = 12)) +
scale_y_continuous(breaks = c(seq(0, 100000, 20000)), labels = function(x) x / 1000)
The proportion of reorders is greatest when days_since_prior_order = 0, with a downward linear trend thereafter (Fig. 7).
sample_orders %>%
filter(!is.na(days_since_prior_order)) %>%
ggplot(aes(days_since_prior_order, fill = factor(reordered, labels = c("Not Reordered", "Reordered")))) +
geom_bar(position = "fill", col = "black") +
labs(x = "Days Since Prior Order", y = "Proportion of Orders", fill = "", caption = "
Figure 7. Proportion of reorders for each value of days_since_prior_order.") +
theme_classic() +
scale_fill_manual(values = c("dodgerblue 2", "darkred")) +
theme(axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 12),
plot.caption = element_text(size = 12, hjust = 0.3),
legend.text = element_text(size = 12))
Fruits and vegetables, especially those which are organic, had the greatest total number of reorders (Fig. 8).
sample_orders %>%
group_by(product_name) %>%
filter(reordered == 1) %>%
summarize(count = n()) %>%
arrange(desc(count)) %>%
head(10) %>%
ggplot(aes(factor(reorder(product_name, count)), count)) +
geom_bar(stat = "identity", fill = "darkred", col = "black") +
theme_classic() +
labs(x = NULL, y = "Reorders (thousands)", caption = "
Figure 8. Top 10 products which had the greatest volume of reorders.") +
theme(axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 12),
axis.title.y = element_text(size = 15),
axis.title.x = element_text(size = 15),
plot.caption = element_text(size = 12)) +
coord_flip() +
scale_y_continuous(breaks = (seq(0, 12000, 2000)), labels = function(x) x / 1000)
As can be seen with Table 1, many of the products with the greatest proportion of reorders are dairy products.
sample_orders %>%
group_by(product_name) %>%
summarize(prop_reordered = mean(reordered), count = n()) %>%
filter(count > 50) %>%
arrange(desc(prop_reordered)) %>%
select(product_name, prop_reordered) %>%
head(10) %>%
kable(caption = "Table 3. Top 10 products with the greatest proportion of reorders.")
product_name | prop_reordered |
---|---|
Milk, Organic, Fat Free | 0.8904110 |
Organic Yogurt Original Plain | 0.8852459 |
Goat Milk | 0.8840580 |
Seltzer Water | 0.8823529 |
Chocolate Crepes | 0.8800000 |
Petit Suisse Fruit | 0.8728814 |
Whole Organic Omega 3 Milk | 0.8645833 |
Organic Lactose Free Whole Milk | 0.8588235 |
Milk, Organic, Vitamin D | 0.8539683 |
Organic Reduced Fat 2% Cottage Cheese | 0.8507463 |
Organic products were reordered at a greater proportion compared to non-organic products (Fig. 9).
sample_orders %>%
mutate(is_organic = str_detect(product_name, "^Organic")) %>%
ggplot(aes(x = factor(is_organic, labels = c("Not Organic", "Organic")),
fill = factor(reordered, labels = c("Not Reordered", "Reordered")))) +
geom_bar(position = "fill", col = "black") +
labs(x = NULL, y = "Proportion of Orders", fill = "", caption = "
Figure 9. Proportion of reorders for organic, and non-organic products.") +
theme_classic() +
scale_fill_manual(values = c("dodgerblue 2", "darkred")) +
theme(axis.title.x = element_text(size = 18),
axis.title.y = element_text(size = 15),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 12),
plot.caption = element_text(size = 12, hjust = 0.8),
legend.text = element_text(size = 12))
The produce department had the most reorders, followed by dairy and eggs (Fig. 10).
sample_orders %>%
filter(reordered == 1) %>%
group_by(department, aisle) %>%
count() %>%
ungroup() %>%
arrange(desc(n)) %>%
head(10) %>%
ggplot(aes(reorder(aisle, n), y = n,
fill = reorder(department, desc(n)))) +
geom_bar(stat = "identity", col = "black") +
labs(x = NULL, y = "Reorders (thousands)", fill = "Department
", caption = "
Figure 10. Top 10 aisles which had the greastest volume of reorders,
coloured by department.") +
theme_classic() +
theme(axis.title.x = element_text(size = 15),
axis.text.x = element_text(size = 15),
axis.text.y = element_text(size = 12),
plot.caption = element_text(size = 12, hjust = 0.5),
legend.text = element_text(size = 12),
legend.title = element_text(size = 15)) +
scale_y_continuous(breaks = (seq(0, 80000, 20000)), labels = function(x) x / 1000) +
coord_flip()
The milk aisle in the dairy and eggs department had the greatest proportion of reorders (Table 4). From the produce department, only fresh fruits were in the top 10 aisles which had the greatest proportion of reorders.
sample_orders %>%
group_by(product_name) %>%
mutate(prop_reorder = mean(reordered), count = n()) %>%
filter(count > 50) %>%
ungroup() %>%
group_by(department, aisle) %>%
summarize(prop_mean = mean(prop_reorder)) %>%
ungroup() %>%
arrange(desc(prop_mean)) %>%
head(10) %>%
kable(caption = "Table 4. Top 10 aisles with the greatest proportion of reorders.")
department | aisle | prop_mean |
---|---|---|
dairy eggs | milk | 0.7914534 |
other | other | 0.7692308 |
alcohol | spirits | 0.7386364 |
beverages | water seltzer sparkling water | 0.7382553 |
produce | fresh fruits | 0.7251602 |
personal care | digestion | 0.7238095 |
pets | cat food care | 0.7129630 |
dairy eggs | eggs | 0.7081332 |
dairy eggs | soy lactosefree | 0.7043425 |
dairy eggs | yogurt | 0.7021933 |