# Load the Instacart CSV tables and stack the two order-product tables.
#
# NOTE(review): setwd() with an absolute, user-specific path makes the script
# non-portable. It is kept here because code outside this chunk may rely on the
# working directory; prefer file.path(<data_dir>, "<file>.csv") — TODO confirm.
setwd("/Users/jessymin/Documents/instacart_dataset/data")

# departments.csv -> department_id (integer), department (character)
department <- read_csv("departments.csv")

# aisles.csv -> aisle_id (integer), aisle (character)
aisle <- read_csv("aisles.csv")

# products.csv -> X (integer), product_id (integer), product_name (character),
#                 aisle_id (integer), department_id (integer)
product <- read_csv("products.csv")

# Drop the surplus `X` column by name rather than by position: `product[, -1]`
# would silently remove `product_id` if the file were ever re-exported without
# the leading `X` column (presumably a row index — verify against the export).
product <- product[, names(product) != "X", drop = FALSE]

# orders.csv -> order_id, user_id, order_number, order_dow (integer),
#               eval_set, order_hour_of_day (character),
#               days_since_prior_order (double)
order <- read_csv("orders.csv")

# order_products__prior.csv / order_products__train.csv share one layout:
# order_id, product_id, add_to_cart_order, reordered (all integer)
order_products__prior <- read_csv("order_products__prior.csv")
order_products__train <- read_csv("order_products__train.csv")

# Stack the prior and train rows into a single order-product table.
order_products <- rbind(order_products__prior, order_products__train)

1. Understanding the data

1-1. 테이블 개요

# Preview the first rows of the `aisle` table.
kable(x = head(aisle), caption = "table 'aisle")
table ’aisle
aisle_id aisle
1 prepared soups salads
2 specialty cheeses
3 energy granola bars
4 instant foods
5 marinades meat preparation
6 other
# Number of distinct users under analysis
length(unique(order[["user_id"]]))
## [1] 206209
# Preview the first rows of the `product` table.
kable(x = head(product), caption = "table 'product")
table ’product
product_id product_name aisle_id department_id
1 Chocolate Sandwich Cookies 61 19
2 All-Seasons Salt 104 13
3 Robust Golden Unsweetened Oolong Tea 94 7
4 Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce 38 1
5 Green Chile Anytime Sauce 5 13
6 Dry Nose Oil 11 11

3. 주문 주기 분석(days_since_prior_order)

# NOTE(review): `cars` is not one of the tables loaded in this document; this
# looks like leftover R Markdown template code and does not summarise
# days_since_prior_order as the section heading suggests — TODO replace with a
# summary of the order data (e.g. order$days_since_prior_order).
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Understanding dataset

1. Data extraction

Prediction set: 75,000 users --> the order_dow distribution is the same as for all users

training with 131,209 users

Number of orders per user

# Count the orders placed by each user: one row per user_id with its
# order_count. count() takes a data frame and cannot be called on a column
# inside summarise(); n() returns the number of rows in the current group.
d <- order %>%
  group_by(user_id) %>%
  summarise(order_count = n())

# SQL version of the per-user order count. The statement must be delimited by
# plain ASCII quotes, and the table name is double-quoted inside the SQL
# because ORDER is an SQL reserved word.
d <- sqldf(
  'SELECT user_id, COUNT(order_id) AS order_count FROM "order" GROUP BY user_id;'
)

Overview of days_since_prior_order

# Distribution of the number of days since the user's previous order.
# Typographic quotes replaced with ASCII quotes so the call parses.
# NOTE(review): with stat = "count" the binwidth argument is presumably
# ignored — confirm and drop one of the two.
ggplot(order, aes(days_since_prior_order)) +
  geom_histogram(binwidth = 1, stat = "count", fill = "red")

--> 0 days, 7 days, 30 days

Who orders 7 days after their previous order?

# Orders placed exactly 7 days after the user's previous order.
user_7days <- order %>%
  filter(days_since_prior_order == 7)

# Number of distinct users with at least one such order
length(unique(user_7days$user_id)) # 100,332 users

Does the day of week differ?

# Order counts by day of week for the 7-day orders.
# Typographic quotes replaced with ASCII quotes so the call parses.
ggplot(user_7days, aes(order_dow)) +
  geom_bar(stat = "count", fill = "steelblue")

Comparison with all users

# Order counts by day of week for all orders (baseline for comparison).
# Typographic quotes replaced with ASCII quotes so the call parses.
ggplot(order, aes(order_dow)) +
  geom_bar(stat = "count", fill = "steelblue")

Who orders 6 days after their previous order?

# Orders placed exactly 6 days after the user's previous order.
user_6days <- order %>%
  filter(days_since_prior_order == 6)

# Order counts by day of week for the 6-day orders
ggplot(user_6days, aes(order_dow)) +
  geom_bar(stat = "count", fill = "steelblue")

# Number of distinct users with at least one such order
length(unique(user_6days$user_id)) # 90,205 users

# Total number of distinct users, for comparison with the subsets
length(unique(order[["user_id"]]))

Orders with days_since_prior_order == 0

# Orders placed on the same day as the user's previous order.
reorder <- filter(order, days_since_prior_order == 0)

# Attach the ordered products, then the product attributes, to those orders.
reorder_all <- left_join(reorder, order_products, by = "order_id")
reorder_all <- left_join(reorder_all, product, by = "product_id")

# Most frequently ordered products among same-day orders. This must start from
# reorder_all: product_name only exists after the joins, not in reorder.
df <- reorder_all %>%
  group_by(product_name) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

## which dow?
table(reorder$order_dow)
table(reorder$order_hour_of_day)
table(reorder$order_dow, reorder$order_hour_of_day)

# order_dow is read as an integer; convert it to a factor so the fill
# aesthetic splits each bar by day of week instead of being treated as a
# continuous value.
ggplot(reorder, aes(order_hour_of_day, fill = factor(order_dow))) +
  geom_bar(stat = "count") +
  labs(fill = "order_dow")