Introduction

The purpose of this study is to analyze customers' carts and buying trends, and to predict which products will be in a customer's cart during their next order.

The dataset provides information on roughly 3 million customer orders and the items in each cart. This is the combined raw customer data.

################################ Loading the required packages ################################
library(DT)         # To display scrollable tables in R Markdown
library(data.table) # For fast import with fread
library(dplyr)      # For data manipulation
library(tidyr)      # For getting the data into tidy format
library(lubridate)  # For extracting and working with dates
library(stringr)    # For working with strings
library(ggplot2)    # For visualizing data
library(janitor)    # For data-cleaning helpers
library(cowplot)    # For easily combining multiple plots
library(knitr)      # For presenting tables with kable
library(shiny)      # For building the Shiny app
# Load Data 
path <- "C:/Users/Sanjiv/Desktop/instaCart/dataset"

aisles <- fread(file.path(path, "aisles.csv"))
departments <- fread(file.path(path, "departments.csv"))
or_pr_prior <- fread(file.path(path, "order_products__prior.csv"))
or_pr_train <- fread(file.path(path, "order_products__train.csv"))
orders <- fread(file.path(path, "orders.csv"))
products <- fread(file.path(path, "products.csv"))

Reshape data

To build the xgboost model and engineer new features, we need to prepare the data sets. We take the following steps. Some columns in the different data frames are of character type, which is a problem for xgboost, since it works only with numeric vectors; we therefore turn these character columns into factors. We create a new "products" table via inner_join(), after which the "aisles" and "departments" tables are no longer needed and can be deleted. We add a "user_id" column to the order-products tables by matching their "order_id" against "order_id" in the "orders" table. Finally, we create a combined "orders_products" table by joining the "orders" table with the prior order-products table.

aisles$aisle <- as.factor(aisles$aisle)
departments$department <- as.factor(departments$department)
orders$eval_set <- as.factor(orders$eval_set)
products$product_name <- as.factor(products$product_name)
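
The paragraph above also describes enriching the products table with aisle and department names via inner_join(); that step is not shown explicitly in the code, so here is a minimal sketch of what it could look like (assuming the standard aisle_id and department_id keys; the aisles and departments tables are removed later in the script once they are no longer needed):

# Sketch: attach aisle and department names to each product
products <- products %>%
  inner_join(aisles, by = "aisle_id") %>%
  inner_join(departments, by = "department_id")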

or_pr_train$user_id <- orders$user_id[match(or_pr_train$order_id, orders$order_id)]

# Products -- Create reorder metrics for the PRODUCTS which are part of prior orders
total_prior_orders <- nrow(orders[orders$eval_set=="prior",])

or_pr_prior <- or_pr_prior %>% 
  inner_join(products, by= "product_id" ) 

or_pr_prior$product_name <- NULL
#Metrics on products
product_metrics<- or_pr_prior %>%
  group_by(product_id) %>%
  summarise(pr_total_orders = n(),
            pr_total_orders_ratio = n()/total_prior_orders,
            pr_mean_add_to_cart = mean(add_to_cart_order),
            pr_reordered_times = sum(reordered) 
  )

product_metrics$pr_reordered_ratio = product_metrics$pr_reordered_times / 
  product_metrics$pr_total_orders
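
As a quick sanity check of these metrics, one could display the most frequently reordered products in a scrollable table (an illustrative use of the DT package loaded earlier, not part of the original pipeline):

# Sketch: inspect the products with the highest reorder ratio
product_metrics %>%
  arrange(desc(pr_reordered_ratio)) %>%
  head(20) %>%
  datatable()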

Daily and weekly shopping habits

Hour of day:

  orders %>% 
  ggplot(aes(x=order_hour_of_day)) + 
  geom_histogram(stat="count",fill="skyblue")

Day of week:

    orders %>% 
    ggplot(aes(x=order_dow)) + 
    geom_histogram(stat="count",fill="skyblue")

Although the dataset does not tell us which day-of-week value corresponds to which weekday, we can clearly see that a lot of shopping occurs during regular working hours.

When do they purchase again?

    orders %>% 
      ggplot(aes(x=days_since_prior_order)) + 
      geom_histogram(stat="count",fill="skyblue")

How many items do people buy?

This seems to follow a Poisson Distribution.

    or_pr_prior %>% 
      group_by(order_id) %>% 
      summarize(n_items = last(add_to_cart_order)) %>%
      ggplot(aes(x=n_items))+
      geom_histogram(stat="count",fill="skyblue") + 
      geom_rug()+
      coord_cartesian(xlim=c(0,80))
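
As a rough check of the Poisson claim above, we can overlay the observed basket-size counts with the counts a Poisson distribution would predict, using the mean basket size as the rate (a hedged sketch; real basket sizes start at 1 and are over-dispersed, so the fit is only approximate):

    # Sketch: compare observed basket sizes with a Poisson fit
    basket_sizes <- or_pr_prior %>%
      group_by(order_id) %>%
      summarize(n_items = last(add_to_cart_order))

    lambda_hat <- mean(basket_sizes$n_items)

    poisson_fit <- data.frame(n_items = 0:80) %>%
      mutate(expected = dpois(n_items, lambda_hat) * nrow(basket_sizes))

    basket_sizes %>%
      count(n_items) %>%
      ggplot(aes(x = n_items, y = n)) +
      geom_col(fill = "skyblue") +
      geom_line(data = poisson_fit, aes(y = expected), color = "red") +
      coord_cartesian(xlim = c(0, 80))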

Top reorder proportion

Although there is some overlap (notably bananas and milk), many of the high-frequency reorders are more niche products such as special kinds of milk and bread.

Top 10

    tmp <-or_pr_prior %>% 
      group_by(product_id) %>% 
      summarize(proportion_reordered = mean(reordered), n=n()) %>% 
      filter(n>40) %>% 
      top_n(10,wt=proportion_reordered) %>% 
      arrange(desc(proportion_reordered)) %>% 
      left_join(products,by="product_id")
    kable(tmp)  
 product_id   proportion_reordered     n   product_name                                 aisle_id   department_id
-----------  ---------------------  ----   -----------------------------------------  ---------  --------------
       6433               0.941176    68   Raw Veggie Wrappers                                13              20
       2075               0.931034    87   Serenity Ultimate Extrema Overnight Pads          126              11
      27740               0.920792   101   Chocolate Love Bar                                 45              19
      13875               0.911111    45   Simply Sleep Nighttime Sleep Aid                    6               2
      31418               0.900000    60   Sparking Water                                    115               7
      35604               0.900000   100   Maca Buttercups                                    45              19
      36543               0.895522    67   Bars Peanut Butter                                 88              13
      26093               0.893939    66   Soy Crisps Lightly Salted                         107              19
      38251               0.891892   111   Benchbreak Chardonnay                              62               5
      36801               0.885417    96   Organic Blueberry B Mega                           31               7

# User Metrics -- Create user metrics based on orders and then on the products contained in those orders
orders$order_dow_hod <- orders$order_dow * 24 + orders$order_hour_of_day

user_metrics <- orders %>% 
  filter(eval_set == "prior") %>%
  group_by(user_id) %>%
  summarise(
    user_total_orders = max(order_number),
    user_mean_dow = mean(order_dow),
    user_mean_hod = mean(order_hour_of_day),
    user_mean_dow_hod = mean(order_dow_hod),
    user_order_frequency = mean(days_since_prior_order, na.rm=T)
  )

test_train_orders <-  orders %>% 
  filter(eval_set != "prior") %>%
  select(user_id, order_id, eval_set, days_since_prior_order)                                

user_metrics <- user_metrics %>%
  inner_join(test_train_orders, by = "user_id")

User Metrics (continued)

or_pr_prior <- or_pr_prior %>%
  inner_join(orders, by = "order_id")

user_metrics2 <- or_pr_prior %>%
  group_by(user_id) %>%
  summarise(
    user_total_products =n(),
    user_distinct_products = n_distinct(product_id),
    user_total_pr_reorders = sum(reordered)
  )

user_metrics2$user_pr_reorder_ratio = user_metrics2$user_total_pr_reorders/ 
  user_metrics2$user_total_products

Create user and product combination metrics

user_product_metrics <- or_pr_prior %>%
  group_by(user_id, product_id) %>%
  summarise(up_total_orders = n(),
            up_mean_add_to_cart= mean(add_to_cart_order),        
            up_total_reorders = sum(reordered)
  ) 

rm(or_pr_prior)
gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   976592  52.2   18207705  972.4  22759632 1215.5
## Vcells 78746126 600.8  438413500 3344.9 451345029 3443.5
user_product_metrics <- user_product_metrics %>% 
  inner_join(product_metrics, by= "product_id") %>%
  inner_join(user_metrics, by= "user_id") %>%
  inner_join(user_metrics2, by= "user_id")

rm(products, aisles, departments,product_metrics, user_metrics, user_metrics2)
gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells    976639   52.2   11652931  622.4  22759632 1215.5
## Vcells 255915210 1952.5  543736772 4148.4 484661114 3697.7

A few more features

user_product_metrics$up_ttlOrd_ttlusrOrd_ratio = user_product_metrics$up_total_orders / user_product_metrics$user_total_orders
user_product_metrics$up_ttlOrd_ttlprOrd_ratio = user_product_metrics$up_total_orders / user_product_metrics$pr_total_orders
user_product_metrics$up_ATC_pr_ATC_ratio = user_product_metrics$up_mean_add_to_cart / user_product_metrics$pr_mean_add_to_cart
user_product_metrics$up_reorder_ratio = user_product_metrics$up_total_reorders / user_product_metrics$user_total_orders
user_product_metrics$user_total_pr_reorder_ratio <- user_product_metrics$user_total_pr_reorders/
  user_product_metrics$user_total_products
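
A quick summary of the engineered ratio features is a useful sanity check for unexpected NAs or out-of-range values (illustrative only, not part of the original pipeline):

# Sketch: summarise the engineered user/product features
user_product_metrics %>%
  ungroup() %>%
  select(starts_with("up_"), user_total_pr_reorder_ratio) %>%
  summary()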

Separate the data into training and test sets

train <- user_product_metrics[user_product_metrics$eval_set == "train",]
test <- user_product_metrics[user_product_metrics$eval_set == "test",]

train <- train %>% 
  left_join(or_pr_train %>% select(user_id, product_id, reordered), 
            by = c("user_id", "product_id"))
train$reordered[is.na(train$reordered)] <- 0

train$eval_set <- NULL
train$user_id <- NULL
train$product_id <- NULL
train$order_id <- NULL

test$eval_set <- NULL
test$user_id <- NULL
test$reordered <- NULL

rm(or_pr_train, orders, test_train_orders, user_product_metrics)
gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   1052798   56.3    4773040  255.0  22759632 1215.5
## Vcells 290710681 2218.0  939868341 7170.7 827422876 6312.8

Build Model

This step is the most important part of the process for model quality.

We use the training data. As explained above, xgboost expects the data and the label separately: the label is the outcome in our data set that we are trying to predict, which here is a binary classification, the "reordered" column. Because xgboost works only with numeric matrices, any categorical data has to be converted into dummy variables when constructing the model.

We use the following functions:

dtrain <- xgb.DMatrix(data = train_data, label = train_label)
bstDMatrix <- xgboost(data = dtrain)

Details are here: https://xgboost.readthedocs.io/en/latest/R-package/xgboostPresentation.html

An advantage of using ensembles of decision trees, such as gradient boosting, is that they can automatically provide estimates of feature importance from a trained predictive model. In xgboost this is obtained with xgb.importance(). The result of this function is a table with all the features together with their gain, cover and frequency. Gain is the improvement in accuracy that a feature brings to the branches it appears on. Cover measures the relative number of observations related to a feature. Frequency is a simpler measure: it just counts the number of times a feature is used across all of the trees that were built; we do not have to use it (unless we have a specific reason to). The Gain column provides the information we are looking for. Finally, we can plot the feature importances from our model using xgb.plot.importance().

There is a list of parameters needed to train our decision tree model. These are the following:

objective: The type of learner we want, for example linear regression, logistic regression, Poisson regression, etc.

eval_metric: The evaluation metric for validation data. A default metric is assigned according to the objective (rmse for regression, error for classification, mean average precision for ranking).

eta: The default value is 0.3. This is the step-size shrinkage used in each update to prevent overfitting. After each boosting step we can directly get the weights of the new features, and eta shrinks the feature weights to make the boosting process more conservative. The range is 0 to 1; a low eta value makes the model more robust to overfitting.

max_depth: The default value is 6. This is the maximum depth (number of splits) of a tree. The range is 1 to infinity.

min_child_weight: The default value is 1. This is the minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node whose sum of instance weight is less than min_child_weight, the building process gives up further partitioning. In linear regression mode this simply corresponds to the minimum number of instances needed in each node. The larger the value, the more conservative the algorithm. The range is 0 to infinity.

gamma: The default value is 0. This is the minimum loss reduction required to make a further partition on a leaf node of the tree. The larger the value, the more conservative the algorithm. The range is 0 to infinity.

subsample: The default value is 1. This is the subsample ratio of the training instances. Setting it to 0.5 means that XGBoost randomly samples half of the data instances to grow trees, which helps prevent overfitting. The range is 0 to 1.

colsample_bytree: The default value is 1. This is the subsample ratio of columns when constructing each tree. The range is 0 to 1.

alpha: L1 regularization term on the weights. The default value is 0.

lambda: L2 regularization term on the weights. The default value is 1.

nrounds: The number of boosting rounds (trees) to build.

library(xgboost)
params <- list(
  "objective"           = "reg:logistic",
  "eval_metric"         = "logloss",
  "eta"                 = 0.1,
  "max_depth"           = 6,
  "min_child_weight"    = 10,
  "gamma"               = 0.70,
  "subsample"           = 0.77,
  "colsample_bytree"    = 0.95,
  "alpha"               = 2e-05,
  "lambda"              = 10
)

train <- as.data.frame(train)
test <- as.data.frame(test)

subtrain <- train # %>% sample_frac(0.1)
X <- xgb.DMatrix(as.matrix(subtrain %>% select(-reordered)), label = subtrain$reordered)
model <- xgboost(data = X, params = params, nrounds = 30)
## [12:18:27] Tree method is automatically selected to be 'approx' for faster speed. to use old behavior(exact greedy algorithm on single machine), set tree_method to 'exact'
## [1]  train-logloss:0.626671 
## [2]  train-logloss:0.572368 
## [3]  train-logloss:0.527316 
## [4]  train-logloss:0.489568 
## [5]  train-logloss:0.457691 
## [6]  train-logloss:0.430599 
## [7]  train-logloss:0.407456 
## [8]  train-logloss:0.387610 
## [9]  train-logloss:0.370535 
## [10] train-logloss:0.355820 
## [11] train-logloss:0.343101 
## [12] train-logloss:0.332121 
## [13] train-logloss:0.322606 
## [14] train-logloss:0.314357 
## [15] train-logloss:0.307211 
## [16] train-logloss:0.301041 
## [17] train-logloss:0.295682 
## [18] train-logloss:0.291027 
## [19] train-logloss:0.286999 
## [20] train-logloss:0.283512 
## [21] train-logloss:0.280478 
## [22] train-logloss:0.277865 
## [23] train-logloss:0.275595 
## [24] train-logloss:0.273631 
## [25] train-logloss:0.271942 
## [26] train-logloss:0.270475 
## [27] train-logloss:0.269208 
## [28] train-logloss:0.268118 
## [29] train-logloss:0.267173 
## [30] train-logloss:0.266367
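
# The hyper-parameters above, including nrounds = 30, were fixed by hand.
# A common way to sanity-check the number of rounds is k-fold cross-validation
# with xgb.cv (a hedged sketch, not part of the original pipeline; nfold and
# early_stopping_rounds are illustrative values):
cv <- xgb.cv(params = params, data = X, nrounds = 100, nfold = 5,
             early_stopping_rounds = 10, verbose = 0)
cv$best_iteration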
importance <- xgb.importance(colnames(X), model = model)
xgb.ggplot.importance(importance)
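
# As noted earlier, the Gain column carries the information we care about;
# the top of the importance table can also be shown directly, e.g.:
kable(head(importance, 10))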

rm(X, importance, subtrain)
gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   2074330  110.8    4773040  255.0  22759632 1215.5
## Vcells 286828963 2188.4  939868341 7170.7 827422876 6312.8
# Apply model -- Make predictions
# Now we perform the prediction with the model we built
# to classify the new data.

# We use xgb.DMatrix to convert our test data into a matrix
X <- xgb.DMatrix(as.matrix(test %>% select(-order_id, -product_id)))

# We perform the prediction using predict()
test$reordered <- predict(model, X)

# We apply a threshold so that every prediction above 0.21 is considered a 1
test$reordered <- (test$reordered > 0.21) * 1
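
# The 0.21 cutoff above is a judgment call. As a rough sanity check (a hedged
# sketch, not part of the original pipeline), we can sweep thresholds over the
# training predictions and look at the F1 score; ideally this would be done on
# a held-out validation split rather than on the data the model was fit to.
train_pred <- predict(model, xgb.DMatrix(as.matrix(train %>% select(-reordered))))
thresholds <- seq(0.05, 0.50, by = 0.01)
f1_scores <- sapply(thresholds, function(t) {
  pred      <- as.numeric(train_pred > t)
  tp        <- sum(pred == 1 & train$reordered == 1)
  precision <- tp / max(sum(pred == 1), 1)
  recall    <- tp / max(sum(train$reordered == 1), 1)
  2 * precision * recall / max(precision + recall, 1e-9)
})
thresholds[which.max(f1_scores)]  # threshold with the best (in-sample) F1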

# Multiplying by 1 converts the logical column "reordered" to numeric type
# We then summarise() a "products" table that lists, for every order,
# the products that were predicted as reordered
submission <- test %>%
  filter(reordered == 1) %>%
  group_by(order_id) %>%
  summarise(
    products = paste(product_id, collapse = " ")
  )
# We create the table "missing" containing the orders in which, according
# to our prediction, no product will be reordered
# (the %in% operator returns a logical vector indicating, for each element
# of one vector, whether it appears in another vector)
missing <- data.frame(
  order_id = unique(test$order_id[!test$order_id %in% submission$order_id]),
  products = "None"
)
# We record the result of the prediction into a file submit.csv

submission <- submission %>% bind_rows(missing) %>% arrange(order_id)
write.csv(submission, file = "submit.csv", row.names = F)