Loding libraries

library(data.table)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(knitr)
library(stringr)
library(DT)

## Warning: package 'DT' was built under R version 3.4.4

Read in the data

orders <- fread('orders.csv')

## 
Read 68.7% of 3421083 rows
Read 3421083 rows and 7 (of 7) columns from 0.101 GB file in 00:00:03

products <- fread('products.csv')
order_products <- fread('order_products__train.csv')
order_products_prior <- fread('order_products__prior.csv')

## 
Read 10.4% of 32434489 rows
Read 25.9% of 32434489 rows
Read 38.5% of 32434489 rows
Read 53.1% of 32434489 rows
Read 68.0% of 32434489 rows
Read 85.1% of 32434489 rows
Read 32434489 rows and 4 (of 4) columns from 0.538 GB file in 00:00:08

aisles <- fread('aisles.csv')
departments <- fread('departments.csv')

Analysing Orders

kable is a very simple table generator.

kable(head(orders,20))

order_id	user_id	eval_set	order_number	order_dow	order_hour_of_day	days_since_prior_order
2539329	1	prior	1	2	8	NA
2398795	1	prior	2	3	7	15
473747	1	prior	3	3	12	21
2254736	1	prior	4	4	7	29
431534	1	prior	5	4	15	28
3367565	1	prior	6	2	7	19
550135	1	prior	7	1	9	20
3108588	1	prior	8	1	14	14
2295261	1	prior	9	1	16	0
2550362	1	prior	10	4	8	30
1187899	1	train	11	4	8	14
2168274	2	prior	1	2	11	NA
1501582	2	prior	2	5	10	10
1901567	2	prior	3	1	10	3
738281	2	prior	4	2	10	8
1673511	2	prior	5	3	11	8
1199898	2	prior	6	2	9	13
3194192	2	prior	7	2	12	14
788338	2	prior	8	1	15	27
1718559	2	prior	9	2	9	8

This file gives a list of all orders we have in the dataset. 1 row per order. For example, we can see that user 1 has 11 orders, 1 of which is in the train set, and 10 of which are prior orders. The orders.csv doesn’t tell us about which products were ordered. This is contained in the order_products.csv

glimpse(orders)

## Observations: 3,421,083
## Variables: 7
## $ order_id               <int> 2539329, 2398795, 473747, 2254736, 4315...
## $ user_id                <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, ...
## $ eval_set               <chr> "prior", "prior", "prior", "prior", "pr...
## $ order_number           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1, 2...
## $ order_dow              <int> 2, 3, 3, 4, 4, 2, 1, 1, 1, 4, 4, 2, 5, ...
## $ order_hour_of_day      <int> 8, 7, 12, 7, 15, 7, 9, 14, 16, 8, 8, 11...
## $ days_since_prior_order <dbl> NA, 15, 21, 29, 28, 19, 20, 14, 0, 30, ...

Recode variables

We should do some recoding and convert character variables to factors.

orders <- orders %>% mutate(order_hour_of_day = as.numeric(order_hour_of_day), eval_set = as.factor(eval_set))
products <- products %>% mutate(product_name = as.factor(product_name))
aisles <- aisles %>% mutate(aisle = as.factor(aisle))
departments <- departments %>% mutate(department = as.factor(department))

Lets Analyise and find when do people most often Order?

orders %>% 
  ggplot(aes(x=order_hour_of_day)) + 
  geom_histogram(stat="count",fill="blue")

## Warning: Ignoring unknown parameters: binwidth, bins, pad

This shows there is the effect of hours of day in volume of Orders. Most orders are between 8.00-18.00

Lets now analyze which Day of Week have the maximun orders

orders %>% 
  ggplot(aes(x=order_dow)) + 
  geom_histogram(stat="count",fill="brown")

## Warning: Ignoring unknown parameters: binwidth, bins, pad

There is a clear effect of day of the week. Most orders are on days 0 and 1. Unfortunately there is no info regarding which values represent which day.

When do customers order again?

orders %>% 
  ggplot(aes(x=days_since_prior_order)) + 
  geom_histogram(stat="count",fill="green")

## Warning: Ignoring unknown parameters: binwidth, bins, pad

## Warning: Removed 206209 rows containing non-finite values (stat_count).

People are ordering after 1 week or 1 month

orders %>% filter(eval_set=="prior") %>% count(order_number) %>% ggplot(aes(order_number,n)) + geom_line(color="red", size=1)+geom_point(size=2, color="red")

##How many items do people buy? Let’s have a look how many items are in the orders.

head(order_products)

##    order_id product_id add_to_cart_order reordered
## 1:        1      49302                 1         1
## 2:        1      11109                 2         1
## 3:        1      10246                 3         0
## 4:        1      49683                 4         0
## 5:        1      43633                 5         1
## 6:        1      13176                 6         0

head(order_products %>% 
  group_by(order_id) %>% 
  summarize(n_items = last(add_to_cart_order)),10)

## # A tibble: 10 x 2
##    order_id n_items
##       <int>   <int>
##  1        1       8
##  2       36       8
##  3       38       9
##  4       96       7
##  5       98      49
##  6      112      11
##  7      170      17
##  8      218       5
##  9      226      13
## 10      349      11

order_to_totalItems<-
order_products %>% 
  group_by(order_id) %>% 
  summarize(n_items = last(add_to_cart_order)) %>% arrange( desc(n_items) )


mean(order_to_totalItems$n_items)

## [1] 10.55276

median(order_to_totalItems$n_items)

## [1] 9

order_products %>% 
  group_by(order_id) %>% 
  summarize(n_items = last(add_to_cart_order)) %>%
  ggplot(aes(x=n_items))+
  geom_histogram(stat="count",fill="red") + 
  geom_rug()+
  coord_cartesian(xlim=c(0,80))

## Warning: Ignoring unknown parameters: binwidth, bins, pad

We can see that people most often order around 9-10 items.

Top Selling Product

Let’s have a look which products are sold most often.

tmp <- order_products %>% 
  group_by(product_id) %>% 
  summarize(count = n()) %>% 
  top_n(5, wt = count) %>%
  left_join(select(products,product_id,product_name),by="product_id") %>%
  arrange(desc(count)) 
kable(tmp)

product_id	count	product_name
24852	18726	Banana
13176	15480	Bag of Organic Bananas
21137	10894	Organic Strawberries
21903	9784	Organic Baby Spinach
47626	8135	Large Lemon

tmp %>% 
  ggplot(aes(x=reorder(product_name,-count), y=count))+
  geom_bar(stat="identity",fill="red")+
  theme(axis.text.x=element_text(angle=90, hjust=1),axis.title.x = element_blank())

The most often sold item is Banana

How often do people order the same items again?

tmp <- order_products %>% 
  group_by(reordered) %>% 
  summarize(count = n()) %>% 
  mutate(reordered = as.factor(reordered)) %>%
  mutate(proportion = count/sum(count))
kable(tmp)

reordered	count	proportion
0	555793	0.4014056
1	828824	0.5985944

tmp %>% 
  ggplot(aes(x=reordered,y=count,fill=reordered))+
  geom_bar(stat="identity")

Close to 60% of the ordered items are reorders.

Most often reordered

Now here it becomes really interesting. These 10 products have the highest probability of being reordered.

tmp <-order_products %>% 
  group_by(product_id) %>% 
  summarize(proportion_reordered = mean(reordered), n=n()) %>% 
  filter(n>40) %>% 
  top_n(10,wt=proportion_reordered) %>% 
  arrange(desc(proportion_reordered)) %>% 
  left_join(products,by="product_id")

kable(tmp)

product_id	proportion_reordered	n	product_name	aisle_id	department_id
1729	0.9347826	92	2% Lactose Free Milk	84	16
20940	0.9130435	368	Organic Low Fat Milk	84	16
12193	0.8983051	59	100% Florida Orange Juice	98	7
21038	0.8888889	81	Organic Spelt Tortillas	128	3
31764	0.8888889	45	Original Sparkling Seltzer Water Cans	115	7
24852	0.8841717	18726	Banana	24	4
117	0.8833333	120	Petit Suisse Fruit	2	16
39180	0.8819876	483	Organic Lowfat 1% Milk	84	16
12384	0.8810409	269	Organic Lactose Free 1% Lowfat Milk	91	16
24024	0.8785249	461	1% Lowfat Milk	84	16

tmp %>% 
  ggplot(aes(x=reorder(product_name,-proportion_reordered), y=proportion_reordered))+
  geom_bar(stat="identity",fill="red")+
  theme(axis.text.x=element_text(angle=90, hjust=1),axis.title.x = element_blank())+coord_cartesian(ylim=c(0.85,0.95))

Which item do people put into the cart first?

tmp <- order_products %>% 
  group_by(product_id, add_to_cart_order) %>% 
  summarize(count = n()) %>% mutate(pct=count/sum(count)) %>% 
  filter(add_to_cart_order == 1, count>10) %>% 
  arrange(desc(pct)) %>% 
  left_join(products,by="product_id") %>% 
  select(product_name, pct, count) %>% 
  ungroup() %>% 
  top_n(10, wt=pct)

## Adding missing grouping variables: `product_id`

kable(tmp)

product_id	product_name	pct	count
45004	White Multifold Towels	0.6610169	39
11885	Sparkling Water, Bottles	0.5942029	41
13128	Purified Alkalkine Water with Minerals pH10	0.5714286	12
4100	Organic Dark Roast	0.5600000	14
1729	2% Lactose Free Milk	0.5217391	48
6729	Cookie Tray	0.4861111	35
9285	Boneless Pork Shoulder Butt	0.4814815	13
6848	Party Tumblers	0.4615385	12
12640	Sport Bottle with Flip Cap Natural Spring Water	0.4615385	12
26405	XL Pick-A-Size Paper Towel Rolls	0.4476190	47

tmp %>% 
  ggplot(aes(x=reorder(product_name,-pct), y=pct))+
  geom_bar(stat="identity",fill="red")+
  theme(axis.text.x=element_text(angle=90, hjust=1),axis.title.x = element_blank())+coord_cartesian(ylim=c(0.4,0.7))

People seem to be quite certain about Multifold Towels and if they buy them, put them into their cart first in 66% of the time.