Here are the libraries used in this analysis.
library(knitr) # web widget
library(ggplot2) # graphing
library(dplyr) # data summarization and manipulation
library(tidyr) # reshaping data frame
library(data.table) # fast file reading
library(treemap) # tree visualization
library(corrgram) # correlation graphics
Total 6 datasets were imported:
We wish there were more information but unfortuantely not provided, such as:
Nevertheless, data provided are still good.
setwd('./datasets')
aisles = fread('aisles.csv', stringsAsFactors = TRUE)
departments = fread('departments.csv', stringsAsFactors = TRUE)
products = fread('products.csv', stringsAsFactors = TRUE)
orders = fread('orders.csv', stringsAsFactors = TRUE)
order_products_train = fread('order_products__train.csv')
order_products_prior = fread('order_products__prior.csv')
Data that we know all possible values are recoded to factor. This includes day of week, hour of day and binary data.
orders$order_dow = as.factor(orders$order_dow)
orders$order_hour_of_day = as.factor(orders$order_hour_of_day)
order_products_train$reordered = as.factor(order_products_train$reordered)
order_products_prior$reordered = as.factor(order_products_prior$reordered)
Let’s understand the key relationship within the database schematic.
orders table:
order_products_train and order_products_prior
IMPORTANT notes for this competition:
We will create new datasets from datasets provided. This allow us to structure the data for clearer analysis prediction. Refer to the data schematic diagram, new datasets are colored in green. The ideas are:
## create user table for per train / test sets
train_users_id = orders %>%
filter(eval_set == 'train') %>%
group_by(user_id) %>%
summarize(n.orders = n())
test_users_id = orders %>%
filter(eval_set == 'test') %>%
group_by(user_id) %>%
summarize(n.orders = n())
## create table for total orders per train / test sets
left_join(orders, by='user_id') %>%
select(-eval_set, -n.orders)
test_orders = test_users_id %>%
left_join(orders, by='user_id') %>%
select(-eval_set, -n.orders)
## create prior order detail table per train / test sets
prior_products_train = train_orders %>%
select(order_id) %>%
left_join(order_products_prior, by='order_id')
prior_products_test = test_orders %>%
select(order_id) %>%
left_join(order_products_prior, by='order_id')
###### For Training Data Set, create train_users
### get the summary product freq per user
train_users =
train_orders %>%
left_join(rbind(order_products_train,prior_products_train)) %>%
left_join(products) %>%
group_by(user_id) %>%
summarize(
n.products =n_distinct(product_id),
n.departments=n_distinct(department_id),
### get the most popular product per user
train_users =
train_orders %>%
left_join(rbind(order_products_train,prior_products_train)) %>%
count(user_id, product_id) %>%
slice(which.max(n)) %>%
select (user_id, busy.product = product_id) %>%
right_join(train_users)
### get the most busy hour per user
train_users =
train_orders %>%
count(user_id, order_hour_of_day) %>%
select (user_id, busy.hour = order_hour_of_day) %>%
right_join(train_users)
### get the most busy day per user
train_users =
train_orders %>%
count(user_id, order_dow) %>%
slice(which.max(n)) %>%
select (user_id, busy.day = order_dow) %>%
right_join(train_users)
### get the days_since_prior_order stats per user
train_users =
train_orders %>%
group_by(user_id, order_id, days_since_prior_order) %>%
summarize(c.items = n()) %>%
group_by(user_id) %>%
summarize(
n.orders = n(),
n.items = sum (c.items),
avg.items= mean(c.items, na.rm=TRUE),
sd.items = sd (c.items, na.rm=TRUE),
n.wait.days = sum (days_since_prior_order, na.rm=TRUE),
avg.wait.days = mean(days_since_prior_order,na.rm=TRUE),
sd.wait.days = sd (days_since_prior_order,na.rm=TRUE)) %>%
right_join(train_users)
###### For Test Data Set, create test_users
### get the summary product freq per user
test_users =
test_orders %>%
left_join(prior_products_test) %>%
left_join(products) %>%
group_by(user_id) %>%
summarize(
n.products =n_distinct(product_id),
n.departments=n_distinct(department_id),
n.aisles =n_distinct(aisle_id))
### get the most popular product per user
test_users =
test_orders %>%
left_join(products) %>%
count(user_id, product_id) %>%
slice(which.max(n)) %>%
select (user_id, busy.product = product_id) %>%
right_join(test_users)
### get the most busy hour per user
test_users =
test_orders %>%
slice(which.max(n)) %>%
select (user_id, busy.hour = order_hour_of_day) %>%
right_join(test_users)
### get the most busy day per user
test_users =
test_orders %>%
count(user_id, order_dow) %>%
slice(which.max(n)) %>%
select (user_id, busy.day = order_dow) %>%
right_join(test_users)
### get the days_since_prior_order stats per user
test_users =
test_orders %>%
left_join(rbind(order_products_train,prior_products_train)) %>%
summarize(c.items = n()) %>%
group_by(user_id) %>%
summarize(
n.orders = n(),
n.items = sum (c.items),
avg.items= mean(c.items, na.rm=TRUE),
sd.items = sd (c.items, na.rm=TRUE),
n.wait.days = sum (days_since_prior_order, na.rm=TRUE),
avg.wait.days = mean(days_since_prior_order,na.rm=TRUE),
sd.wait.days = sd (days_since_prior_order,na.rm=TRUE)) %>%
right_join(test_users)
aisles
glimpse(aisles)
Observations: 134
Variables: 2
$ aisle_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,...
$ aisle <fctr> prepared soups salads, specialty cheeses, energy granola bars, instant foods, marinades meat preparation, other, packaged mea...
summary(aisles[,2])
aisle
air fresheners candles: 1
asian foods : 1
baby accessories : 1
baby bath body care : 1
baby food formula : 1
bakery desserts : 1
(Other) :128
departments
glimpse(departments)
Observations: 21
Variables: 2
$ department_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
$ department <fctr> frozen, other, bakery, produce, alcohol, international, beverages, pets, dry goods pasta, bulk, personal care, meat seaf...
summary(departments[,2])
department
alcohol : 1
babies : 1
bakery : 1
beverages: 1
breakfast: 1
bulk : 1
(Other) :15
products
glimpse(products)
Observations: 49,688
Variables: 4
$ product_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33...
$ product_name <fctr> Chocolate Sandwich Cookies, All-Seasons Salt, Robust Golden Unsweetened Oolong Tea, Smart Ones Classic Favorites Mini Ri...
$ aisle_id <int> 61, 104, 94, 38, 5, 11, 98, 116, 120, 115, 31, 119, 11, 74, 56, 103, 35, 79, 63, 98, 40, 20, 49, 47, 3, 41, 127, 121, 81,...
$ department_id <int> 19, 13, 7, 1, 13, 11, 7, 1, 16, 7, 7, 1, 11, 17, 18, 19, 12, 1, 9, 7, 8, 11, 12, 11, 19, 8, 11, 14, 15, 1, 4, 19, 9, 14, ...
summary(products[,-1])
product_name aisle_id department_id
#2 Coffee Filters : 1 Min. : 1.00 Min. : 1.00
#2 Cone White Coffee Filters : 1 1st Qu.: 35.00 1st Qu.: 7.00
#2 Mechanical Pencils : 1 Median : 69.00 Median :13.00
#4 Natural Brown Coffee Filters : 1 Mean : 67.77 Mean :11.73
& Go! Hazelnut Spread + Pretzel Sticks: 1 3rd Qu.:100.00 3rd Qu.:17.00
'Swingtop' Premium Lager : 1 Max. :134.00 Max. :21.00
(Other) :49682
train_users_id
glimpse(train_users_id)
Observations: 131,209
Variables: 2
$ user_id <int> 1, 2, 5, 7, 8, 9, 10, 13, 14, 17, 18, 21, 23, 24, 27, 29, 30, 34, 37, 38, 41, 42, 43, 44, 46, 47, 48, 49, 50, 52, 53, 55, 56, ...
$ n.orders <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
summary(train_users_id)
user_id n.orders
Min. : 1 Min. :1
1st Qu.: 51587 1st Qu.:1
Median :103150 Median :1
Mean :103167 Mean :1
3rd Qu.:154868 3rd Qu.:1
Max. :206209 Max. :1
test_users_id
glimpse(test_users_id)
Observations: 75,000
Variables: 2
$ user_id <int> 3, 4, 6, 11, 12, 15, 16, 19, 20, 22, 25, 26, 28, 31, 32, 33, 35, 36, 39, 40, 45, 51, 54, 57, 58, 60, 61, 68, 69, 73, 75, 77, 8...
$ n.orders <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
summary(test_users_id)
user_id n.orders
Min. : 3 Min. :1
1st Qu.: 51495 1st Qu.:1
Median :103030 Median :1
Mean :102997 Mean :1
3rd Qu.:154295 3rd Qu.:1
Max. :206208 Max. :1
train_orders
glimpse(train_orders)
Observations: 2,178,586
Variables: 6
$ user_id <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 7, 7, 7, 7, 7, 7, 7...
$ order_id <int> 2539329, 2398795, 473747, 2254736, 431534, 3367565, 550135, 3108588, 2295261, 2550362, 1187899, 2168274, 1501582...
$ order_number <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 1, 2, 3, 4,...
$ order_dow <fctr> 2, 3, 3, 4, 4, 2, 1, 1, 1, 4, 4, 2, 5, 1, 2, 3, 2, 2, 1, 2, 1, 1, 1, 4, 3, 1, 3, 0, 3, 1, 0, 3, 1, 0, 2, 5, 1, ...
$ order_hour_of_day <fctr> 8, 7, 12, 7, 15, 7, 9, 14, 16, 8, 8, 11, 10, 10, 10, 11, 9, 12, 15, 9, 11, 10, 9, 11, 10, 11, 12, 16, 18, 18, 1...
$ days_since_prior_order <dbl> NA, 15, 21, 29, 28, 19, 20, 14, 0, 30, 14, NA, 10, 3, 8, 8, 13, 14, 27, 8, 6, 30, 28, 30, 13, 30, NA, 11, 10, 19...
summary(train_orders[,c('order_dow','order_hour_of_day','days_since_prior_order')])
order_dow order_hour_of_day days_since_prior_order
0:383657 10 : 183465 Min. : 0.00
1:374368 11 : 181129 1st Qu.: 4.00
2:297071 15 : 180622 Median : 7.00
3:277672 14 : 179637 Mean :11.11
4:271077 13 : 176833 3rd Qu.:15.00
5:289006 12 : 173706 Max. :30.00
6:285735 (Other):1103194 NA's :131209
test_orders
glimpse(test_orders)
Observations: 1,242,497
Variables: 6
$ user_id <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12,...
$ order_id <int> 1374495, 444309, 3002854, 2037211, 2710558, 1972919, 1839752, 3225766, 3160850, 676467, 521107, 1402502, 2774568...
$ order_number <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, ...
$ order_dow <fctr> 1, 3, 3, 2, 0, 0, 0, 0, 0, 3, 0, 1, 5, 6, 4, 4, 5, 5, 3, 5, 4, 2, 3, 0, 5, 5, 5, 5, 5, 5, 6, 1, 5, 5, 3, 1, 1, ...
$ order_hour_of_day <fctr> 14, 19, 16, 18, 17, 16, 15, 17, 16, 16, 18, 15, 15, 11, 11, 15, 13, 13, 12, 18, 16, 18, 16, 11, 11, 10, 13, 11,...
$ days_since_prior_order <dbl> NA, 9, 21, 20, 12, 7, 7, 7, 7, 17, 11, 15, 11, NA, 19, 21, 15, 0, 30, NA, 6, 12, 22, NA, 12, 14, 30, 30, 7, 30, ...
summary(test_orders[,c('order_dow','order_hour_of_day','days_since_prior_order')])
order_dow order_hour_of_day days_since_prior_order
0:217248 10 :104953 Min. : 0.00
1:213110 11 :103599 1st Qu.: 4.00
2:170189 14 :103405 Median : 7.00
3:159300 15 :103017 Mean :11.13
4:155262 13 :101166 3rd Qu.:15.00
5:164362 12 : 99135 Max. :30.00
6:163026 (Other):627222 NA's :75000
order_products_prior
glimpse(order_products_train)
Observations: 1,384,617
Variables: 4
$ order_id <int> 1, 1, 1, 1, 1, 1, 1, 1, 36, 36, 36, 36, 36, 36, 36, 36, 38, 38, 38, 38, 38, 38, 38, 38, 38, 96, 96, 96, 96, 96, 96, 9...
$ product_id <int> 49302, 11109, 10246, 49683, 43633, 13176, 47209, 22035, 39612, 19660, 49235, 43086, 46620, 34497, 48679, 46979, 11913...
$ add_to_cart_order <int> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, ...
$ reordered <fctr> 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
summary(order_products_train[,c('add_to_cart_order','reordered')])
add_to_cart_order reordered
Min. : 1.000 0:555793
1st Qu.: 3.000 1:828824
Median : 7.000
Mean : 8.758
3rd Qu.:12.000
Max. :80.000
prior_products_train
glimpse(prior_products_train)
Observations: 20,773,200
Variables: 4
$ order_id <int> 2539329, 2539329, 2539329, 2539329, 2539329, 2398795, 2398795, 2398795, 2398795, 2398795, 2398795, 473747, 473747, 47...
$ product_id <int> 196, 14084, 12427, 26088, 26405, 196, 10258, 12427, 13176, 26088, 13032, 196, 12427, 10258, 25133, 30450, 196, 12427,...
$ add_to_cart_order <int> 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 1, 2, 3, 4, 5, 1, ...
$ reordered <fctr> 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
summary(prior_products_train[,c('add_to_cart_order','reordered')])
add_to_cart_order reordered
Min. : 1.00 0 : 8474661
1st Qu.: 3.00 1 :12167330
Median : 6.00 NA's: 131209
Mean : 8.34
3rd Qu.: 11.00
Max. :145.00
NA's :131209
Number of unique order_id = 131209
prior_products_test
glimpse(prior_products_test)
Observations: 11,867,498
Variables: 4
$ order_id <int> 1374495, 1374495, 1374495, 1374495, 1374495, 1374495, 1374495, 1374495, 1374495, 1374495, 444309, 444309, 444309, 444...
$ product_id <int> 9387, 17668, 15143, 16797, 39190, 47766, 21903, 39922, 24810, 32402, 38596, 21903, 248, 40604, 8021, 17668, 21137, 23...
$ add_to_cart_order <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 7, 8, 9,...
$ reordered <fctr> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,...
summary(prior_products_test[,c('add_to_cart_order','reordered')])
add_to_cart_order reordered
Min. : 1.00 0 :4833292
1st Qu.: 3.00 1 :6959206
Median : 6.00 NA's: 75000
Mean : 8.37
3rd Qu.: 11.00
Max. :109.00
NA's :75000
train_users
glimpse(train_users)
Observations: 131,209
Variables: 14
$ user_id <int> 1, 2, 5, 7, 8, 9, 10, 13, 14, 17, 18, 21, 23, 24, 27, 29, 30, 34, 37, 38, 41, 42, 43, 44, 46, 47, 48, 49, 50, 52, 53, 55,...
$ n.orders <int> 11, 15, 5, 21, 4, 4, 6, 13, 14, 41, 7, 34, 5, 19, 82, 19, 9, 6, 24, 13, 6, 17, 12, 4, 20, 6, 11, 9, 68, 28, 4, 8, 13, 11,...
$ n.items <int> 71, 227, 47, 216, 68, 99, 148, 87, 222, 301, 51, 212, 53, 40, 786, 243, 13, 38, 201, 215, 81, 141, 181, 44, 164, 33, 127,...
$ avg.items <dbl> 6.454545, 15.133333, 9.400000, 10.285714, 17.000000, 24.750000, 24.666667, 6.692308, 15.857143, 7.341463, 7.285714, 6.235...
$ sd.items <dbl> 2.3393861, 7.2196821, 2.7018512, 5.7458059, 3.6514837, 10.4043260, 16.4032517, 1.9315199, 9.0626634, 2.9377692, 4.1115401...
$ n.wait.days <dbl> 190, 228, 46, 209, 70, 66, 109, 92, 276, 320, 35, 345, 74, 264, 359, 209, 173, 110, 311, 261, 133, 222, 130, 90, 330, 48,...
$ avg.wait.days <dbl> 19.000000, 16.285714, 11.500000, 10.450000, 23.333333, 22.000000, 21.800000, 7.666667, 21.230769, 8.000000, 5.833333, 10....
$ sd.wait.days <dbl> 9.030811, 10.268912, 5.446712, 8.786802, 11.547005, 13.856406, 8.555700, 1.922751, 11.121704, 7.696153, 2.926887, 7.47533...
$ busy.day <fctr> 4, 1, 0, 0, 1, 0, 3, 0, 5, 1, 6, 1, 5, 0, 2, 5, 1, 4, 5, 0, 0, 0, 4, 0, 0, 2, 5, 2, 1, 1, 1, 2, 1, 0, 4, 1, 6, 0, 3, 1, ...
$ busy.hour <fctr> 7, 10, 18, 9, 0, 10, 15, 12, 8, 12, 16, 9, 12, 8, 14, 11, 13, 15, 14, 9, 20, 13, 12, 9, 16, 22, 9, 11, 11, 10, 10, 13, 8...
$ busy.product <int> 196, 32792, 11777, 40852, 23165, 27973, 16797, 27086, 29509, 7350, 36216, 23729, 13544, 31222, 2966, 39170, 21386, 3957, ...
$ n.products <int> 20, 122, 29, 70, 51, 59, 99, 31, 146, 87, 36, 104, 43, 19, 224, 68, 7, 29, 112, 98, 59, 62, 102, 37, 63, 28, 67, 51, 90, ...
$ n.departments <int> 8, 14, 11, 13, 9, 14, 11, 13, 18, 14, 11, 13, 13, 11, 12, 14, 5, 10, 16, 15, 14, 13, 17, 15, 15, 14, 13, 15, 12, 9, 12, 1...
$ n.aisles <int> 14, 38, 18, 35, 20, 27, 30, 19, 54, 36, 24, 39, 26, 15, 44, 44, 5, 19, 36, 42, 31, 37, 46, 27, 33, 25, 32, 28, 39, 28, 26...
summary(train_users)
user_id n.orders n.items avg.items sd.items n.wait.days avg.wait.days sd.wait.days busy.day
Min. : 1 Min. : 4.0 Min. : 5.0 Min. : 1.016 Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.000 0:40155
1st Qu.: 51587 1st Qu.: 6.0 1st Qu.: 48.0 1st Qu.: 5.971 1st Qu.: 2.500 1st Qu.: 89.0 1st Qu.: 9.80 1st Qu.: 5.374 1:28040
Median :103150 Median : 10.0 Median : 95.0 Median : 9.083 Median : 3.904 Median :151.0 Median :15.00 Median : 8.136 2:15269
Mean :103167 Mean : 16.6 Mean : 168.9 Mean :10.109 Mean : 4.389 Mean :173.3 Mean :15.45 Mean : 7.691 3:11924
3rd Qu.:154868 3rd Qu.: 20.0 3rd Qu.: 203.0 3rd Qu.:13.111 3rd Qu.: 5.715 3rd Qu.:254.0 3rd Qu.:20.67 3rd Qu.:10.065 4:10150
Max. :206209 Max. :100.0 Max. :3690.0 Max. :60.667 Max. :38.859 Max. :365.0 Max. :30.00 Max. :17.321 5:11734
6:13937
busy.hour busy.product n.products n.departments n.aisles
10 :14647 Min. : 1 Min. : 2.00 Min. : 2.0 Min. : 2.00
9 :13998 1st Qu.:10341 1st Qu.: 30.00 1st Qu.:10.0 1st Qu.: 18.00
11 :12910 Median :21288 Median : 54.00 Median :13.0 Median : 28.00
12 :11170 Mean :21788 Mean : 69.83 Mean :12.2 Mean : 30.25
13 :10280 3rd Qu.:31678 3rd Qu.: 92.00 3rd Qu.:15.0 3rd Qu.: 40.00
14 : 9788 Max. :49683 Max. :729.00 Max. :22.0 Max. :103.00
(Other):58416
test_users
glimpse(test_users)
Observations: 75,000
Variables: 14
$ user_id <int> 3, 4, 6, 11, 12, 15, 16, 19, 20, 22, 25, 26, 28, 31, 32, 33, 35, 36, 39, 40, 45, 51, 54, 57, 58, 60, 61, 68, 69, 73, 75, ...
$ n.orders <int> 13, 6, 4, 8, 6, 23, 7, 10, 5, 16, 4, 13, 25, 21, 6, 4, 10, 38, 8, 10, 5, 4, 78, 15, 16, 9, 5, 7, 6, 8, 24, 13, 8, 6, 10, ...
$ n.items <int> 13, 6, 4, 8, 6, 23, 7, 10, 5, 16, 4, 13, 25, 21, 6, 4, 10, 38, 8, 10, 5, 4, 78, 15, 16, 9, 5, 7, 6, 8, 24, 13, 8, 6, 10, ...
$ avg.items <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
$ sd.items <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ n.wait.days <dbl> 144, 85, 40, 131, 130, 234, 116, 84, 45, 191, 90, 146, 277, 111, 111, 76, 192, 353, 79, 112, 63, 46, 364, 113, 137, 204, ...
$ avg.wait.days <dbl> 12.000000, 17.000000, 13.333333, 18.714286, 26.000000, 10.636364, 19.333333, 9.333333, 11.250000, 12.733333, 30.000000, 1...
$ sd.wait.days <dbl> 5.134553, 10.977249, 8.082904, 10.812251, 6.928203, 4.756404, 8.571270, 5.123475, 12.685293, 11.572546, 0.000000, 8.61112...
$ busy.day <fctr> 0, 4, 2, 5, 1, 1, 0, 5, 1, 5, 1, 5, 0, 3, 3, 1, 4, 0, 6, 0, 1, 0, 4, 6, 0, 2, 5, 5, 2, 3, 1, 0, 6, 2, 6, 3, 3, 5, 0, 5, ...
$ busy.hour <fctr> 16, 11, 16, 11, 9, 9, 14, 12, 11, 19, 14, 9, 17, 11, 12, 14, 16, 17, 12, 9, 20, 14, 11, 7, 10, 11, 10, 10, 11, 16, 13, 9...
$ busy.product <int> 39190, 35469, 21903, 8309, 7076, 14715, 5134, 17008, 9387, 22935, 22008, 7521, 24759, 12440, 9637, 7969, 4942, 11079, 258...
$ n.products <int> 34, 18, 13, 62, 62, 14, 47, 134, 8, 35, 20, 35, 107, 191, 81, 35, 91, 56, 69, 37, 18, 25, 251, 32, 66, 46, 80, 52, 31, 17...
$ n.departments <int> 10, 10, 6, 13, 15, 4, 9, 16, 3, 12, 12, 11, 16, 16, 15, 10, 13, 14, 14, 11, 6, 8, 16, 13, 16, 12, 17, 12, 12, 10, 15, 15,...
$ n.aisles <int> 17, 15, 9, 36, 34, 9, 18, 60, 7, 20, 18, 24, 46, 49, 36, 23, 39, 26, 32, 22, 13, 15, 68, 19, 39, 20, 46, 28, 24, 14, 43, ...
summary(test_users)
user_id n.orders n.items avg.items sd.items n.wait.days avg.wait.days sd.wait.days busy.day
Min. : 3 Min. : 4.00 Min. : 4.00 Min. :1 Min. :0 Min. : 1.0 Min. : 0.1667 Min. : 0.000 0:22724
1st Qu.: 51495 1st Qu.: 6.00 1st Qu.: 6.00 1st Qu.:1 1st Qu.:0 1st Qu.: 89.0 1st Qu.: 9.8485 1st Qu.: 5.391 1:16120
Median :103030 Median : 10.00 Median : 10.00 Median :1 Median :0 Median :151.0 Median :14.9474 Median : 8.103 2: 8668
Mean :102997 Mean : 16.57 Mean : 16.57 Mean :1 Mean :0 Mean :173.2 Mean :15.4551 Mean : 7.683 3: 6938
3rd Qu.:154295 3rd Qu.: 20.00 3rd Qu.: 20.00 3rd Qu.:1 3rd Qu.:0 3rd Qu.:253.0 3rd Qu.:20.6000
Max. :206208 Max. :100.00 Max. :100.00 Max. :1 Max. :0 Max. :365.0 Max. :30.0000 Max. :17.321 5: 6577
6: 8069
busy.hour busy.product n.products n.departments n.aisles
10 : 8491 Min. : 1 Min. : 2.00 Min. : 2.00 Min. : 2.00
9 : 8038 1st Qu.: 9580 1st Qu.: 26.00 1st Qu.: 9.00 1st Qu.: 16.00
11 : 7378 Median :21006 Median : 49.00 Median :12.00 Median : 26.00
12 : 6217 Mean :21308 Mean : 65.44 Mean :11.82 Mean : 28.75
13 : 5809 3rd Qu.:30827 3rd Qu.: 87.00 3rd Qu.:15.00 3rd Qu.: 39.00
14 : 5730 Max. :49683 Max. :644.00 Max. :21.00 Max. :101.00
(Other):33337
Let’s understand what we are selling. Products are categorized into aisles. Each department has multiple aisles.
Total number of departments: 21
Total number of aisles: 134
Total number of products: 49688
tmp = products %>%
group_by(department_id,aisle_id) %>%
summarize(count=n()) %>%
left_join(aisles, by='aisle_id') %>%
left_join(departments, by='department_id') %>%
mutate( onesize = 1, percentage = count / sum(count))
treemap(tmp, #Your data frame object
index=c("department","aisle"), #A list of your categorical variables
vSize = "onesize", #This is your quantitative variable
vColor="department",
type="index", #Type sets the organization and color scheme of your treemap
palette = "Set3", #Select your color palette from the RColorBrewer presets or make your own.
title="Our Departments and Aisles", #Customize your title
bg.labels = "yellow"
)
rm(list='tmp')
products %>%
group_by(department_id) %>%
summarize(n.products=n()) %>%
mutate( percentage = n.products / sum(n.products)) %>%
left_join(departments) %>%
geom_col() +
theme (
axis.text.x=element_text(angle=90, hjust=1, vjust=0.5))
products %>%
group_by(department_id, aisle_id) %>%
summarize(n.products = n()) %>%
ungroup(tmp) %>%
mutate(percentage = n.products/sum(n.products)) %>%
left_join(departments) %>%
left_join(aisles) %>%
select(department_id, department, aisle_id, aisle, n.products, percentage) %>%
arrange(desc(n.products))
Joining, by = "department_id"
Joining, by = "aisle_id"
orders %>%
ggplot( aes(x=order_dow)) +
geom_bar() + ylab('Number of Orders')
orders %>%
ggplot( aes(x=order_hour_of_day)) +
geom_bar() + ylab('Number of Orders')
orders %>%
group_by(user_id) %>%
summarize(n.orders=n()) %>%
group_by(n.orders) %>%
summarize (n.users = n()) %>%
ggplot(aes(x=n.orders, y=percentage)) +
geom_col() + labs(x='Number of Orders Per Customer', y='Percentage of Customers')
Days since prior order of 30 accounts for all orders made more than 30 days ago (about 11%)
Interesting to find out that the decreasing pattern is similar to order frequency
orders %>%
group_by(days_since_prior_order) %>%
summarize ( n.orders = n()) %>%
ggplot(aes(x=days_since_prior_order, y=percentage.of.orders)) +
geom_col()
rbind(order_products_prior, order_products_train) %>%
group_by(order_id) %>%
summarise(n_items=n()) %>%
ggplot(aes(x=n_items)) +
geom_bar() + labs(x='Number of Items per Order', y='Number of Customers') +
tmp = rbind(order_products_train, order_products_prior) %>%
left_join(products) %>%
left_join(departments) %>%
group_by(department) %>%
summarize(count=n()) %>%
mutate(percentage=count/sum(count))
ggplot (tmp, aes(x=reorder(department,-count), y=percentage)) +
geom_col() + ggtitle('Departments') + ylab('percentage.of.orders') +
theme (
axis.text.x=element_text(angle=90, hjust=1, vjust=0.5),
axis.title.x = element_blank())
tmp %>% arrange(desc(count))
tmp = rbind(order_products_train, order_products_prior) %>%
left_join(products) %>%
left_join(aisles) %>%
group_by(aisle) %>%
summarize(count=n()) %>%
top_n(15, wt=count) %>%
mutate(percentage=count/sum(count))
ggplot (tmp, aes(x=reorder(aisle,-count), y=percentage)) +
geom_col() + ggtitle('Top 15 Aisles') + ylab('Percentage of Orders') +
theme (
axis.text.x=element_text(angle=90, hjust=1, vjust=0.5),
axis.title.x = element_blank())
tmp %>% arrange(desc(count))
tmp = rbind(order_products_train, order_products_prior) %>%
left_join(products, by='product_id') %>%
group_by(product_name) %>%
summarize(count=n()) %>%
top_n(15, wt=count) %>%
mutate(percentage=count/sum(count))
ggplot (tmp, aes(x=reorder(product_name,-count), y=percentage)) +
geom_col() + ylab('Percentage of Orders') +
theme (
axis.text.x=element_text(angle=90, hjust=1, vjust=0.5),
axis.title.x = element_blank())
tmp %>% arrange(desc(count))
Customer purchases proportioned to item order occurance.
tmp = rbind(order_products_train, order_products_prior) %>%
left_join(products) %>%
left_join(departments) %>%
left_join(aisles) %>%
group_by(department, aisle, product_name) %>%
summarize(count=n())
treemap(tmp, #Your data frame object
index=c("department","aisle"), #A list of your categorical variables
vSize = "count", #This is your quantitative variable
vColor="department",
type="index", #Type sets the organization and color scheme of your treemap
palette = "Set3", #Select your color palette from the RColorBrewer presets or make your own.
title="Customer Purchases Tree Map", #Customize your title
fontsize.title = 14, #Change the font size of the title
bg.labels = "yellow"
)
It seems things that customers buys doesn’t differ significantly througout the day.
rbind(order_products_train,order_products_prior) %>%
left_join(products) %>%
left_join(departments) %>%
left_join(orders) %>%
summarize(count = n()) %>%
ggplot (aes(x=order_hour_of_day, y=count, fill=department)) +
geom_col()+ ylab('Orders Quantity') + ggtitle('Orders over 24 Hours')
Top 5 products sold hourly are consistant, and it make up about 50% of hourly product sold.
rbind(order_products_train,order_products_prior) %>%
left_join(products) %>%
left_join(orders) %>%
group_by(order_hour_of_day, product_name) %>%
summarize(count = n()) %>%
top_n(n=5, wt=count) %>%
ggplot (aes(x=order_hour_of_day, y=count, fill=product_name)) +
geom_col() + ylab('Product Quantity') + ggtitle('Top 5 Purchased Product')
It seems things that customers buys doesn’t differ significantly througout the week !
rbind(order_products_train,order_products_prior) %>%
left_join(products) %>%
left_join(departments) %>%
left_join(orders) %>%
group_by(order_dow, department) %>%
summarize(count = n()) %>%
ggplot (aes(x=order_dow, y=count, fill=department)) +
geom_col() + ylab('Orders Quantity') + ggtitle('Number of Orders Daily')
Top 5 products are consistant daily, and it make up to 50% of daily product sold.
rbind(order_products_train,order_products_prior) %>%
left_join(products) %>%
left_join(orders) %>%
group_by(order_dow, product_name) %>%
summarize(count = n()) %>%
ggplot (aes(x=order_dow, y=count, fill=product_name)) +
geom_col() + ylab('Product Quantity') +ggtitle('Top 5 Product Sold Daily')
It is surprising to see that only 60% of products had been reordered.
tmp = rbind(order_products_train,order_products_prior) %>%
group_by(reordered) %>%
summarize(count = n()) %>%
mutate(probability = count/sum(count))
tmp %>%
ggplot(aes(x=factor(0), y=probability, fill=reordered)) +
geom_col(width=1) +
coord_polar(theta='y')
tmp
tmp = rbind(order_products_train,order_products_prior) %>%
left_join(products) %>%
group_by(product_name,reordered) %>%
summarize(count=n()) %>%
spread(reordered, count) %>%
select(product_name, yes=`1`, no=`0`) %>%
mutate( total = yes + no, yes.rate = yes/(yes+no), no.rate=no/(yes+no)) %>%
ungroup()
tmp %>%
top_n(15, wt=yes.rate) %>%
gather(reordered, rate ,5:6) %>%
arrange(desc(rate)) %>%
ggplot (aes(x=product_name, y=rate, fill=reordered)) +
theme (
axis.text.x=element_text(angle=90, hjust=1, vjust=0.5),
axis.title.x = element_blank()) + ylab('Reordered Rate') + ggtitle('Top 15 Highest Reordered Rate Products')
tmp %>% arrange(desc(yes.rate))
rm(list='tmp')
How many do customer buy on their most active hour ?
rbind(train_users, test_users) %>%
ggplot ( aes(x=busy.hour, y=n.orders)) + geom_col() + ggtitle('Orders by User Busy Hour')
rbind(train_users, test_users) %>%
ggplot ( aes(x=busy.hour, y=n.items)) + geom_col() + ggtitle('Product Ordered by User Busy Hour')
How many do customer buy on their most active day ?
rbind(train_users, test_users) %>%
ggplot ( aes(x=busy.day, y=n.orders)) + geom_col() + ggtitle('Orders by User Busy Day')
rbind(train_users, test_users) %>%
ggplot ( aes(x=busy.day, y=n.items)) + geom_col() + ggtitle('Product Ordered by User Busy Day')
The correlation matrix give a general idea that frequent customers (who placed many orders) has strong correlation with days between orders and number of items they order.
rbind(train_users, test_users) %>%
select(-user_id) %>%
corrgram(order=TRUE, lower.panel=panel.shade,
upper.panel=panel.pie, text.panel=panel.txt,
main="Corrgram of mtcars intercorrelations")
As general trend, frequent customers don’t wait too long before placing next order.
rbind(train_users, test_users) %>%
ggplot( aes(x=n.orders, y=avg.wait.days)) + geom_point() + geom_smooth() +
xlab('Number of Orders Per Customer')
Frequent buyers as they ordered more frequently, the items per order generally also reduced.
rbind(train_users, test_users) %>%
ggplot( aes(x=n.orders, y=avg.items)) + geom_point() + geom_smooth() +
xlab('Number of Orders Per Customer') + ylab('Average Number of Items Per Order')
Frequent buyers (loyal customers) consume more variety of products.
rbind(train_users, test_users) %>%
ggplot( aes(x=n.orders, y=n.products)) + geom_point() + geom_smooth() +
xlab('Number of Orders Per Customer') + ylab('Number of Unique Products Ordered')
F1score is the official measurement of scoring, measured per test order basis. - List of products need to be predicted per test order
- Compre the predicted list with actual prodcut list, measure F1 score
- Final F1 score = average f1 score for all test orders
This custom function measures F1 score by having predicted products and actual products purchased:
- parse the two strings into two charactor vectors, representing predicted vector and actual vector
- calculate x, as how many correctly predicted (exist in both predicted and actual vectors)
- calculate precision and tpr
- calculate f1score
Repeat this F1score for all test orders. Final F1 score is the average score for all orders.
require(stringr)
Loading required package: stringr
f1score <- function(list_a, list_b) {
list_a <- str_split(list_a, ' ')[[1]]
x <- length(intersect(list_a, list_b))
pr <- x / length(list_b)
re <- x / length(list_a)
f1 <- 0
if (pr + re) {
f1 <- 2 * pr * re / (pr + re)
}
return(f1)
}
tmp = train_orders %>%
left_join(rbind(order_products_train,prior_products_train)) %>%
group_by(user_id, order_id, days_since_prior_order) %>%
summarize(n.items = n())
Joining, by = "order_id"
tmp %>%
group_by(user_id) %>%
summarize(
n.orders=n(),
t.items = sum(n.items),
avg.items=mean(n.items, na.rm=TRUE),
sd.items=sd(n.items, na.rm=TRUE),
n.wait.days=sum(days_since_prior_order, na.rm=TRUE),
avg.wait.days=mean(days_since_prior_order,na.rm=TRUE),
sd.wait.days=sd(days_since_prior_order,na.rm=TRUE))