library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(knitr)
library(stringr)
library(DT)
## Warning: package 'DT' was built under R version 3.4.4
orders <- fread('orders.csv')
##
Read 68.7% of 3421083 rows
Read 3421083 rows and 7 (of 7) columns from 0.101 GB file in 00:00:03
products <- fread('products.csv')
order_products <- fread('order_products__train.csv')
order_products_prior <- fread('order_products__prior.csv')
##
Read 10.4% of 32434489 rows
Read 25.9% of 32434489 rows
Read 38.5% of 32434489 rows
Read 53.1% of 32434489 rows
Read 68.0% of 32434489 rows
Read 85.1% of 32434489 rows
Read 32434489 rows and 4 (of 4) columns from 0.538 GB file in 00:00:08
aisles <- fread('aisles.csv')
departments <- fread('departments.csv')
kable is a very simple table generator.
kable(head(orders,20))
| order_id | user_id | eval_set | order_number | order_dow | order_hour_of_day | days_since_prior_order |
|---|---|---|---|---|---|---|
| 2539329 | 1 | prior | 1 | 2 | 8 | NA |
| 2398795 | 1 | prior | 2 | 3 | 7 | 15 |
| 473747 | 1 | prior | 3 | 3 | 12 | 21 |
| 2254736 | 1 | prior | 4 | 4 | 7 | 29 |
| 431534 | 1 | prior | 5 | 4 | 15 | 28 |
| 3367565 | 1 | prior | 6 | 2 | 7 | 19 |
| 550135 | 1 | prior | 7 | 1 | 9 | 20 |
| 3108588 | 1 | prior | 8 | 1 | 14 | 14 |
| 2295261 | 1 | prior | 9 | 1 | 16 | 0 |
| 2550362 | 1 | prior | 10 | 4 | 8 | 30 |
| 1187899 | 1 | train | 11 | 4 | 8 | 14 |
| 2168274 | 2 | prior | 1 | 2 | 11 | NA |
| 1501582 | 2 | prior | 2 | 5 | 10 | 10 |
| 1901567 | 2 | prior | 3 | 1 | 10 | 3 |
| 738281 | 2 | prior | 4 | 2 | 10 | 8 |
| 1673511 | 2 | prior | 5 | 3 | 11 | 8 |
| 1199898 | 2 | prior | 6 | 2 | 9 | 13 |
| 3194192 | 2 | prior | 7 | 2 | 12 | 14 |
| 788338 | 2 | prior | 8 | 1 | 15 | 27 |
| 1718559 | 2 | prior | 9 | 2 | 9 | 8 |
This file gives a list of all orders we have in the dataset. 1 row per order. For example, we can see that user 1 has 11 orders, 1 of which is in the train set, and 10 of which are prior orders. The orders.csv doesn’t tell us about which products were ordered. This is contained in the order_products.csv
glimpse(orders)
## Observations: 3,421,083
## Variables: 7
## $ order_id <int> 2539329, 2398795, 473747, 2254736, 4315...
## $ user_id <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, ...
## $ eval_set <chr> "prior", "prior", "prior", "prior", "pr...
## $ order_number <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1, 2...
## $ order_dow <int> 2, 3, 3, 4, 4, 2, 1, 1, 1, 4, 4, 2, 5, ...
## $ order_hour_of_day <int> 8, 7, 12, 7, 15, 7, 9, 14, 16, 8, 8, 11...
## $ days_since_prior_order <dbl> NA, 15, 21, 29, 28, 19, 20, 14, 0, 30, ...
We should do some recoding and convert character variables to factors.
orders <- orders %>% mutate(order_hour_of_day = as.numeric(order_hour_of_day), eval_set = as.factor(eval_set))
products <- products %>% mutate(product_name = as.factor(product_name))
aisles <- aisles %>% mutate(aisle = as.factor(aisle))
departments <- departments %>% mutate(department = as.factor(department))
orders %>%
ggplot(aes(x=order_hour_of_day)) +
geom_histogram(stat="count",fill="blue")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
This shows there is the effect of hours of day in volume of Orders. Most orders are between 8.00-18.00
orders %>%
ggplot(aes(x=order_dow)) +
geom_histogram(stat="count",fill="brown")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
There is a clear effect of day of the week. Most orders are on days 0 and 1. Unfortunately there is no info regarding which values represent which day.
orders %>%
ggplot(aes(x=days_since_prior_order)) +
geom_histogram(stat="count",fill="green")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 206209 rows containing non-finite values (stat_count).
People are ordering after 1 week or 1 month
orders %>% filter(eval_set=="prior") %>% count(order_number) %>% ggplot(aes(order_number,n)) + geom_line(color="red", size=1)+geom_point(size=2, color="red")
##How many items do people buy? Let’s have a look how many items are in the orders.
head(order_products)
## order_id product_id add_to_cart_order reordered
## 1: 1 49302 1 1
## 2: 1 11109 2 1
## 3: 1 10246 3 0
## 4: 1 49683 4 0
## 5: 1 43633 5 1
## 6: 1 13176 6 0
head(order_products %>%
group_by(order_id) %>%
summarize(n_items = last(add_to_cart_order)),10)
## # A tibble: 10 x 2
## order_id n_items
## <int> <int>
## 1 1 8
## 2 36 8
## 3 38 9
## 4 96 7
## 5 98 49
## 6 112 11
## 7 170 17
## 8 218 5
## 9 226 13
## 10 349 11
order_to_totalItems<-
order_products %>%
group_by(order_id) %>%
summarize(n_items = last(add_to_cart_order)) %>% arrange( desc(n_items) )
mean(order_to_totalItems$n_items)
## [1] 10.55276
median(order_to_totalItems$n_items)
## [1] 9
order_products %>%
group_by(order_id) %>%
summarize(n_items = last(add_to_cart_order)) %>%
ggplot(aes(x=n_items))+
geom_histogram(stat="count",fill="red") +
geom_rug()+
coord_cartesian(xlim=c(0,80))
## Warning: Ignoring unknown parameters: binwidth, bins, pad
We can see that people most often order around 9-10 items.
Let’s have a look which products are sold most often.
tmp <- order_products %>%
group_by(product_id) %>%
summarize(count = n()) %>%
top_n(5, wt = count) %>%
left_join(select(products,product_id,product_name),by="product_id") %>%
arrange(desc(count))
kable(tmp)
| product_id | count | product_name |
|---|---|---|
| 24852 | 18726 | Banana |
| 13176 | 15480 | Bag of Organic Bananas |
| 21137 | 10894 | Organic Strawberries |
| 21903 | 9784 | Organic Baby Spinach |
| 47626 | 8135 | Large Lemon |
tmp %>%
ggplot(aes(x=reorder(product_name,-count), y=count))+
geom_bar(stat="identity",fill="red")+
theme(axis.text.x=element_text(angle=90, hjust=1),axis.title.x = element_blank())
The most often sold item is Banana
tmp <- order_products %>%
group_by(reordered) %>%
summarize(count = n()) %>%
mutate(reordered = as.factor(reordered)) %>%
mutate(proportion = count/sum(count))
kable(tmp)
| reordered | count | proportion |
|---|---|---|
| 0 | 555793 | 0.4014056 |
| 1 | 828824 | 0.5985944 |
tmp %>%
ggplot(aes(x=reordered,y=count,fill=reordered))+
geom_bar(stat="identity")
Close to 60% of the ordered items are reorders.
Now here it becomes really interesting. These 10 products have the highest probability of being reordered.
tmp <-order_products %>%
group_by(product_id) %>%
summarize(proportion_reordered = mean(reordered), n=n()) %>%
filter(n>40) %>%
top_n(10,wt=proportion_reordered) %>%
arrange(desc(proportion_reordered)) %>%
left_join(products,by="product_id")
kable(tmp)
| product_id | proportion_reordered | n | product_name | aisle_id | department_id |
|---|---|---|---|---|---|
| 1729 | 0.9347826 | 92 | 2% Lactose Free Milk | 84 | 16 |
| 20940 | 0.9130435 | 368 | Organic Low Fat Milk | 84 | 16 |
| 12193 | 0.8983051 | 59 | 100% Florida Orange Juice | 98 | 7 |
| 21038 | 0.8888889 | 81 | Organic Spelt Tortillas | 128 | 3 |
| 31764 | 0.8888889 | 45 | Original Sparkling Seltzer Water Cans | 115 | 7 |
| 24852 | 0.8841717 | 18726 | Banana | 24 | 4 |
| 117 | 0.8833333 | 120 | Petit Suisse Fruit | 2 | 16 |
| 39180 | 0.8819876 | 483 | Organic Lowfat 1% Milk | 84 | 16 |
| 12384 | 0.8810409 | 269 | Organic Lactose Free 1% Lowfat Milk | 91 | 16 |
| 24024 | 0.8785249 | 461 | 1% Lowfat Milk | 84 | 16 |
tmp %>%
ggplot(aes(x=reorder(product_name,-proportion_reordered), y=proportion_reordered))+
geom_bar(stat="identity",fill="red")+
theme(axis.text.x=element_text(angle=90, hjust=1),axis.title.x = element_blank())+coord_cartesian(ylim=c(0.85,0.95))
tmp <- order_products %>%
group_by(product_id, add_to_cart_order) %>%
summarize(count = n()) %>% mutate(pct=count/sum(count)) %>%
filter(add_to_cart_order == 1, count>10) %>%
arrange(desc(pct)) %>%
left_join(products,by="product_id") %>%
select(product_name, pct, count) %>%
ungroup() %>%
top_n(10, wt=pct)
## Adding missing grouping variables: `product_id`
kable(tmp)
| product_id | product_name | pct | count |
|---|---|---|---|
| 45004 | White Multifold Towels | 0.6610169 | 39 |
| 11885 | Sparkling Water, Bottles | 0.5942029 | 41 |
| 13128 | Purified Alkalkine Water with Minerals pH10 | 0.5714286 | 12 |
| 4100 | Organic Dark Roast | 0.5600000 | 14 |
| 1729 | 2% Lactose Free Milk | 0.5217391 | 48 |
| 6729 | Cookie Tray | 0.4861111 | 35 |
| 9285 | Boneless Pork Shoulder Butt | 0.4814815 | 13 |
| 6848 | Party Tumblers | 0.4615385 | 12 |
| 12640 | Sport Bottle with Flip Cap Natural Spring Water | 0.4615385 | 12 |
| 26405 | XL Pick-A-Size Paper Towel Rolls | 0.4476190 | 47 |
tmp %>%
ggplot(aes(x=reorder(product_name,-pct), y=pct))+
geom_bar(stat="identity",fill="red")+
theme(axis.text.x=element_text(angle=90, hjust=1),axis.title.x = element_blank())+coord_cartesian(ylim=c(0.4,0.7))
People seem to be quite certain about Multifold Towels and if they buy them, put them into their cart first in 66% of the time.