Instacart Market Basket Analysis

Which products will an Instacart consumer purchase again?

The objective of this Kaggle competition is to use the anonymized data on customer orders over time to predict which previously purchased products will be in a user’s next order.

Solution Approach

At a high level, the idea is to engineer features that capture reorder behavior for individual products as well as each user's preferences for specific products. We also derive features from the users' order histories themselves.

For example, nearly everyone likes, say, an Apple iPhone, so product-level metrics should capture how often a product is reordered on its own merit. Conversely, if Mr. A likes some exotic food rarely bought by anyone else, user-level reorder metrics should capture that preference. Other order-based features include ordering frequency and timing preferences (day of week, hour of day).

Once the features are created, they are assembled into a denormalized table keyed by user id and product id. The problem then becomes a binary classification task: for each (user, product) pair, predict whether the product will appear in the user's next order. We use XGBoost as the classifier.
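To make the target structure concrete, here is a minimal sketch (with invented values, not drawn from the data) of the table the classifier consumes: one row per (user_id, product_id) pair seen in the user's prior orders, feature columns like those built below, and the 0/1 reordered label.

# Illustrative only: the feature values below are made up.
sketch <- data.frame(
        user_id               = c(1, 1, 2),
        product_id            = c(196, 12427, 196),
        up_total_orders       = c(10, 9, 1),          # times this user ordered this product
        pr_reordered_ratio    = c(0.78, 0.74, 0.78),  # product-level reorder rate
        user_pr_reorder_ratio = c(0.69, 0.69, 0.36),  # user-level reorder rate
        reordered             = c(1, 0, 0)            # label: in the user's next order?
)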

Load the data

# Load Data ---------------------------------------------------------------
library(data.table)
library(dplyr)
library(tidyr)
path <- "./input"

aisles <- fread(file.path(path, "aisles.csv"))
departments <- fread(file.path(path, "departments.csv"))
or_pr_prior <- fread(file.path(path, "order_products__prior.csv"))
## Read 32434489 rows and 4 (of 4) columns from 0.538 GB file in 00:00:22
or_pr_train <- fread(file.path(path, "order_products__train.csv"))
orders <- fread(file.path(path, "orders.csv"))
## Read 3421083 rows and 7 (of 7) columns from 0.101 GB file in 00:00:13
products <- fread(file.path(path, "products.csv"))
  1. Reshape data.
# Reshape data -----------------------------------------------------------------
aisles$aisle <- as.factor(aisles$aisle)
departments$department <- as.factor(departments$department)
orders$eval_set <- as.factor(orders$eval_set)
products$product_name <- as.factor(products$product_name)

or_pr_train$user_id <- orders$user_id[match(or_pr_train$order_id, orders$order_id)]
  2. Create reorder metrics for the products that appear in prior orders.
# Products ----------------------------------------------------------------
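# Total count of prior orders across all users; denominator for pr_total_orders_ratio below.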
total_prior_orders <- nrow(orders[orders$eval_set=="prior",])

or_pr_prior <- or_pr_prior %>% 
               inner_join(products, by= "product_id" ) 

or_pr_prior$product_name <- NULL

#Metrics on products
product_metrics<- or_pr_prior %>%
                        group_by(product_id) %>%
                        summarise(pr_total_orders = n(),
                                  pr_total_orders_ratio = n()/total_prior_orders,
                                  pr_mean_add_to_cart = mean(add_to_cart_order),
                                  pr_reordered_times = sum(reordered) 
                          )
                           
product_metrics$pr_reordered_ratio = product_metrics$pr_reordered_times / 
                                                product_metrics$pr_total_orders
  3. Create user metrics, first from the orders themselves and then from the products contained in those orders.
# User Metrics -------------------------------------------------------------------
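# Encode day of week and hour of day as a single hour-of-week index (0-167).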
orders$order_dow_hod <- orders$order_dow * 24 + orders$order_hour_of_day

user_metrics <- orders %>% 
                filter(eval_set == "prior") %>%
                group_by(user_id) %>%
                summarise(
                        user_total_orders = max(order_number),
                        user_mean_dow = mean(order_dow),
                        user_mean_hod = mean(order_hour_of_day),
                        user_mean_dow_hod = mean(order_dow_hod),
                        user_order_frequency = mean(days_since_prior_order, na.rm=T)
                )

test_train_orders <-  orders %>% 
                        filter(eval_set != "prior") %>%
                        select(user_id, order_id, eval_set, days_since_prior_order)                                

user_metrics <- user_metrics %>%
                        inner_join(test_train_orders, by = "user_id")
#User Metrics - Contd...--------------------------------------------------------

or_pr_prior <- or_pr_prior %>%
        inner_join(orders, by = "order_id")

user_metrics2 <- or_pr_prior %>%
                group_by(user_id) %>%
                summarise(
                        user_total_products =n(),
                        user_distinct_products = n_distinct(product_id),
                        user_total_pr_reorders = sum(reordered)
                )

user_metrics2$user_pr_reorder_ratio = user_metrics2$user_total_pr_reorders/ 
                                                user_metrics2$user_total_products
  4. Create user and product combination metrics.
user_product_metrics <- or_pr_prior %>%
                        group_by(user_id, product_id) %>%
                        summarise(up_total_orders = n(),
                                  up_mean_add_to_cart= mean(add_to_cart_order),        
                                  up_total_reorders = sum(reordered)
                                 ) 

rm(or_pr_prior)
gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   503573  26.9   13897102  742.2  17371378  927.8
## Vcells 64400052 491.4  382768864 2920.3 395131472 3014.7
user_product_metrics <- user_product_metrics %>% 
        inner_join(product_metrics, by= "product_id") %>%
        inner_join(user_metrics, by= "user_id") %>%
        inner_join(user_metrics2, by= "user_id")

rm(products, aisles, departments,product_metrics, user_metrics, user_metrics2)
gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells    453986   24.3    8894144  475.0  17371378  927.8
## Vcells 234687244 1790.6  460818311 3515.8 457005925 3486.7
# A few more features...
user_product_metrics$up_ttlOrd_ttlusrOrd_ratio = user_product_metrics$up_total_orders / user_product_metrics$user_total_orders
user_product_metrics$up_ttlOrd_ttlprOrd_ratio = user_product_metrics$up_total_orders / user_product_metrics$pr_total_orders
user_product_metrics$up_ATC_pr_ATC_ratio = user_product_metrics$up_mean_add_to_cart / user_product_metrics$pr_mean_add_to_cart
user_product_metrics$up_reorder_ratio = user_product_metrics$up_total_reorders / user_product_metrics$user_total_orders
user_product_metrics$user_total_pr_reorder_ratio <- user_product_metrics$user_total_pr_reorders/
                                                user_product_metrics$user_total_products
  5. Separate data into training and test sets.
train <- user_product_metrics[user_product_metrics$eval_set == "train",]
test <- user_product_metrics[user_product_metrics$eval_set == "test",]
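# Attach labels from the train orders; (user, product) pairs absent there were not reordered, so the resulting NAs become 0.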

train <- train %>% 
        left_join(or_pr_train %>% select(user_id, product_id, reordered), 
                  by = c("user_id", "product_id"))
train$reordered[is.na(train$reordered)] <- 0

train$eval_set <- NULL
train$user_id <- NULL
train$product_id <- NULL
train$order_id <- NULL

test$eval_set <- NULL
test$user_id <- NULL
test$reordered <- NULL

rm(or_pr_train, orders, test_train_orders, user_product_metrics)
gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells    529517   28.3    3643040  194.6  17371378  927.8
## Vcells 269480067 2056.0  796585240 6077.5 795299587 6067.7
  6. Build the model.
# Model -------------------------------------------------------------------
library(xgboost)
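# Note: reg:logistic outputs probabilities in (0, 1); with a 0/1 label and
# logloss evaluation it is effectively binary classification.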
params <- list(
        "objective"           = "reg:logistic",
        "eval_metric"         = "logloss",
        "eta"                 = 0.1,
        "max_depth"           = 6,
        "min_child_weight"    = 10,
        "gamma"               = 0.70,
        "subsample"           = 0.77,
        "colsample_bytree"    = 0.95,
        "alpha"               = 2e-05,
        "lambda"              = 10
)

train <- as.data.frame(train)
test <- as.data.frame(test)

subtrain <- train # %>% sample_frac(0.1)
X <- xgb.DMatrix(as.matrix(subtrain %>% select(-reordered)), label = subtrain$reordered)
model <- xgboost(data = X, params = params, nrounds = 30)
## [16:45:48] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
## [1]  train-logloss:0.626692 
## [2]  train-logloss:0.572394 
## [3]  train-logloss:0.527354 
## [4]  train-logloss:0.489616 
## [5]  train-logloss:0.457775 
## [6]  train-logloss:0.430683 
## [7]  train-logloss:0.407531 
## [8]  train-logloss:0.387669 
## [9]  train-logloss:0.370661 
## [10] train-logloss:0.355941 
## [11] train-logloss:0.343220 
## [12] train-logloss:0.332231 
## [13] train-logloss:0.322721 
## [14] train-logloss:0.314470 
## [15] train-logloss:0.307318 
## [16] train-logloss:0.301129 
## [17] train-logloss:0.295785 
## [18] train-logloss:0.291139 
## [19] train-logloss:0.287103 
## [20] train-logloss:0.283617 
## [21] train-logloss:0.280594 
## [22] train-logloss:0.277968 
## [23] train-logloss:0.275712 
## [24] train-logloss:0.273748 
## [25] train-logloss:0.272049 
## [26] train-logloss:0.270587 
## [27] train-logloss:0.269329 
## [28] train-logloss:0.268236 
## [29] train-logloss:0.267305 
## [30] train-logloss:0.266493
importance <- xgb.importance(colnames(X), model = model)
xgb.ggplot.importance(importance)

rm(X, importance, subtrain)
gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   1301045   69.5    3643040  194.6  17371378  927.8
## Vcells 267323654 2039.6  796585240 6077.5 795299587 6067.7
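Only train-logloss is reported above, and it keeps falling by construction. Here is a minimal sketch of a held-out check with xgb.cv (not part of the original run; it assumes X is still in memory, i.e. it is run before the rm(X, ...) call above):

# Hedged sketch: 5-fold CV to estimate held-out logloss.
cv <- xgb.cv(params = params, data = X, nrounds = 30, nfold = 5, verbose = 0)
tail(cv$evaluation_log)  # compare test_logloss_mean with the train-logloss above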
  7. Make predictions.
# Apply model -------------------------------------------------------------
X <- xgb.DMatrix(as.matrix(test %>% select(-order_id, -product_id)))
test$reordered <- predict(model, X)
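# Binarize with a fixed cutoff; the competition is scored on mean F1 per order
# (see the threshold-tuning sketch at the end).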

test$reordered <- (test$reordered > 0.21) * 1

submission <- test %>%
        filter(reordered == 1) %>%
        group_by(order_id) %>%
        summarise(
                products = paste(product_id, collapse = " ")
        )
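# Orders where no product clears the cutoff must still appear in the submission,
# with products = "None".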

missing <- data.frame(
        order_id = unique(test$order_id[!test$order_id %in% submission$order_id]),
        products = "None"
)

submission <- submission %>% bind_rows(missing) %>% arrange(order_id)
write.csv(submission, file = "submit.csv", row.names = F)
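The 0.21 cutoff above is a fixed choice. A hedged sketch of one way to pick it is to maximize global F1 on a held-out slice of the training data. Here valid is a hypothetical data.frame with a pred probability column and the true reordered label (neither exists in the code above), and since the competition averages F1 per order, global F1 is only a rough proxy:

# Hypothetical helper: global F1 of hard predictions at threshold th.
f1_at <- function(th, pred, label) {
        p  <- as.integer(pred > th)
        tp <- sum(p == 1 & label == 1)
        precision <- tp / max(sum(p == 1), 1)
        recall    <- tp / max(sum(label == 1), 1)
        if (precision + recall == 0) return(0)
        2 * precision * recall / (precision + recall)
}
thresholds <- seq(0.05, 0.50, by = 0.01)
# best <- thresholds[which.max(sapply(thresholds, f1_at,
#                                     pred = valid$pred, label = valid$reordered))]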