Loding libraries

library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(knitr)
library(stringr)
library(DT)
## Warning: package 'DT' was built under R version 3.4.4

Read in the data

orders <- fread('orders.csv')
## 
Read 68.7% of 3421083 rows
Read 3421083 rows and 7 (of 7) columns from 0.101 GB file in 00:00:03
products <- fread('products.csv')
order_products <- fread('order_products__train.csv')
order_products_prior <- fread('order_products__prior.csv')
## 
Read 10.4% of 32434489 rows
Read 25.9% of 32434489 rows
Read 38.5% of 32434489 rows
Read 53.1% of 32434489 rows
Read 68.0% of 32434489 rows
Read 85.1% of 32434489 rows
Read 32434489 rows and 4 (of 4) columns from 0.538 GB file in 00:00:08
aisles <- fread('aisles.csv')
departments <- fread('departments.csv')

Analysing Orders

kable is a very simple table generator.

kable(head(orders,20))
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order
2539329 1 prior 1 2 8 NA
2398795 1 prior 2 3 7 15
473747 1 prior 3 3 12 21
2254736 1 prior 4 4 7 29
431534 1 prior 5 4 15 28
3367565 1 prior 6 2 7 19
550135 1 prior 7 1 9 20
3108588 1 prior 8 1 14 14
2295261 1 prior 9 1 16 0
2550362 1 prior 10 4 8 30
1187899 1 train 11 4 8 14
2168274 2 prior 1 2 11 NA
1501582 2 prior 2 5 10 10
1901567 2 prior 3 1 10 3
738281 2 prior 4 2 10 8
1673511 2 prior 5 3 11 8
1199898 2 prior 6 2 9 13
3194192 2 prior 7 2 12 14
788338 2 prior 8 1 15 27
1718559 2 prior 9 2 9 8

This file gives a list of all orders we have in the dataset. 1 row per order. For example, we can see that user 1 has 11 orders, 1 of which is in the train set, and 10 of which are prior orders. The orders.csv doesn’t tell us about which products were ordered. This is contained in the order_products.csv

glimpse(orders)
## Observations: 3,421,083
## Variables: 7
## $ order_id               <int> 2539329, 2398795, 473747, 2254736, 4315...
## $ user_id                <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, ...
## $ eval_set               <chr> "prior", "prior", "prior", "prior", "pr...
## $ order_number           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1, 2...
## $ order_dow              <int> 2, 3, 3, 4, 4, 2, 1, 1, 1, 4, 4, 2, 5, ...
## $ order_hour_of_day      <int> 8, 7, 12, 7, 15, 7, 9, 14, 16, 8, 8, 11...
## $ days_since_prior_order <dbl> NA, 15, 21, 29, 28, 19, 20, 14, 0, 30, ...

Recode variables

We should do some recoding and convert character variables to factors.

orders <- orders %>% mutate(order_hour_of_day = as.numeric(order_hour_of_day), eval_set = as.factor(eval_set))
products <- products %>% mutate(product_name = as.factor(product_name))
aisles <- aisles %>% mutate(aisle = as.factor(aisle))
departments <- departments %>% mutate(department = as.factor(department))

Lets Analyise and find when do people most often Order?

orders %>% 
  ggplot(aes(x=order_hour_of_day)) + 
  geom_histogram(stat="count",fill="blue")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

This shows there is the effect of hours of day in volume of Orders. Most orders are between 8.00-18.00

Lets now analyze which Day of Week have the maximun orders

orders %>% 
  ggplot(aes(x=order_dow)) + 
  geom_histogram(stat="count",fill="brown")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

There is a clear effect of day of the week. Most orders are on days 0 and 1. Unfortunately there is no info regarding which values represent which day.

When do customers order again?

orders %>% 
  ggplot(aes(x=days_since_prior_order)) + 
  geom_histogram(stat="count",fill="green")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 206209 rows containing non-finite values (stat_count).

People are ordering after 1 week or 1 month

orders %>% filter(eval_set=="prior") %>% count(order_number) %>% ggplot(aes(order_number,n)) + geom_line(color="red", size=1)+geom_point(size=2, color="red")

##How many items do people buy? Let’s have a look how many items are in the orders.

head(order_products)
##    order_id product_id add_to_cart_order reordered
## 1:        1      49302                 1         1
## 2:        1      11109                 2         1
## 3:        1      10246                 3         0
## 4:        1      49683                 4         0
## 5:        1      43633                 5         1
## 6:        1      13176                 6         0
head(order_products %>% 
  group_by(order_id) %>% 
  summarize(n_items = last(add_to_cart_order)),10)
## # A tibble: 10 x 2
##    order_id n_items
##       <int>   <int>
##  1        1       8
##  2       36       8
##  3       38       9
##  4       96       7
##  5       98      49
##  6      112      11
##  7      170      17
##  8      218       5
##  9      226      13
## 10      349      11
order_to_totalItems<-
order_products %>% 
  group_by(order_id) %>% 
  summarize(n_items = last(add_to_cart_order)) %>% arrange( desc(n_items) )


mean(order_to_totalItems$n_items)
## [1] 10.55276
median(order_to_totalItems$n_items)
## [1] 9
order_products %>% 
  group_by(order_id) %>% 
  summarize(n_items = last(add_to_cart_order)) %>%
  ggplot(aes(x=n_items))+
  geom_histogram(stat="count",fill="red") + 
  geom_rug()+
  coord_cartesian(xlim=c(0,80))
## Warning: Ignoring unknown parameters: binwidth, bins, pad

We can see that people most often order around 9-10 items.

Top Selling Product

Let’s have a look which products are sold most often.

tmp <- order_products %>% 
  group_by(product_id) %>% 
  summarize(count = n()) %>% 
  top_n(5, wt = count) %>%
  left_join(select(products,product_id,product_name),by="product_id") %>%
  arrange(desc(count)) 
kable(tmp)
product_id count product_name
24852 18726 Banana
13176 15480 Bag of Organic Bananas
21137 10894 Organic Strawberries
21903 9784 Organic Baby Spinach
47626 8135 Large Lemon
tmp %>% 
  ggplot(aes(x=reorder(product_name,-count), y=count))+
  geom_bar(stat="identity",fill="red")+
  theme(axis.text.x=element_text(angle=90, hjust=1),axis.title.x = element_blank())

The most often sold item is Banana

How often do people order the same items again?

tmp <- order_products %>% 
  group_by(reordered) %>% 
  summarize(count = n()) %>% 
  mutate(reordered = as.factor(reordered)) %>%
  mutate(proportion = count/sum(count))
kable(tmp)
reordered count proportion
0 555793 0.4014056
1 828824 0.5985944
tmp %>% 
  ggplot(aes(x=reordered,y=count,fill=reordered))+
  geom_bar(stat="identity")

Close to 60% of the ordered items are reorders.

Most often reordered

Now here it becomes really interesting. These 10 products have the highest probability of being reordered.

tmp <-order_products %>% 
  group_by(product_id) %>% 
  summarize(proportion_reordered = mean(reordered), n=n()) %>% 
  filter(n>40) %>% 
  top_n(10,wt=proportion_reordered) %>% 
  arrange(desc(proportion_reordered)) %>% 
  left_join(products,by="product_id")

kable(tmp)
product_id proportion_reordered n product_name aisle_id department_id
1729 0.9347826 92 2% Lactose Free Milk 84 16
20940 0.9130435 368 Organic Low Fat Milk 84 16
12193 0.8983051 59 100% Florida Orange Juice 98 7
21038 0.8888889 81 Organic Spelt Tortillas 128 3
31764 0.8888889 45 Original Sparkling Seltzer Water Cans 115 7
24852 0.8841717 18726 Banana 24 4
117 0.8833333 120 Petit Suisse Fruit 2 16
39180 0.8819876 483 Organic Lowfat 1% Milk 84 16
12384 0.8810409 269 Organic Lactose Free 1% Lowfat Milk 91 16
24024 0.8785249 461 1% Lowfat Milk 84 16
tmp %>% 
  ggplot(aes(x=reorder(product_name,-proportion_reordered), y=proportion_reordered))+
  geom_bar(stat="identity",fill="red")+
  theme(axis.text.x=element_text(angle=90, hjust=1),axis.title.x = element_blank())+coord_cartesian(ylim=c(0.85,0.95))

Which item do people put into the cart first?

tmp <- order_products %>% 
  group_by(product_id, add_to_cart_order) %>% 
  summarize(count = n()) %>% mutate(pct=count/sum(count)) %>% 
  filter(add_to_cart_order == 1, count>10) %>% 
  arrange(desc(pct)) %>% 
  left_join(products,by="product_id") %>% 
  select(product_name, pct, count) %>% 
  ungroup() %>% 
  top_n(10, wt=pct)
## Adding missing grouping variables: `product_id`
kable(tmp)
product_id product_name pct count
45004 White Multifold Towels 0.6610169 39
11885 Sparkling Water, Bottles 0.5942029 41
13128 Purified Alkalkine Water with Minerals pH10 0.5714286 12
4100 Organic Dark Roast 0.5600000 14
1729 2% Lactose Free Milk 0.5217391 48
6729 Cookie Tray 0.4861111 35
9285 Boneless Pork Shoulder Butt 0.4814815 13
6848 Party Tumblers 0.4615385 12
12640 Sport Bottle with Flip Cap Natural Spring Water 0.4615385 12
26405 XL Pick-A-Size Paper Towel Rolls 0.4476190 47
tmp %>% 
  ggplot(aes(x=reorder(product_name,-pct), y=pct))+
  geom_bar(stat="identity",fill="red")+
  theme(axis.text.x=element_text(angle=90, hjust=1),axis.title.x = element_blank())+coord_cartesian(ylim=c(0.4,0.7))

People seem to be quite certain about Multifold Towels and if they buy them, put them into their cart first in 66% of the time.