# Load the Instacart CSV extracts.
#
# File paths are built from `data_dir` instead of calling setwd(), so the
# session's working directory is left untouched. Column types are pinned to
# the specification readr reported when it guessed them, so the resulting
# tibbles do not depend on the guessing rules of the installed readr version.
data_dir <- "/Users/jessymin/Documents/instacart_dataset/data"

department <- read_csv(
  file.path(data_dir, "departments.csv"),
  col_types = cols(
    department_id = col_integer(),
    department = col_character()
  )
)

aisle <- read_csv(
  file.path(data_dir, "aisles.csv"),
  col_types = cols(
    aisle_id = col_integer(),
    aisle = col_character()
  )
)

product <- read_csv(
  file.path(data_dir, "products.csv"),
  col_types = cols(
    X = col_integer(),
    product_id = col_integer(),
    product_name = col_character(),
    aisle_id = col_integer(),
    department_id = col_integer()
  )
)
# Drop the first column (`X` in the parsed specification).
# NOTE(review): presumably a row index written when the file was exported.
product <- product[, -1]

# order_hour_of_day stays a character column, exactly as readr guessed it.
order <- read_csv(
  file.path(data_dir, "orders.csv"),
  col_types = cols(
    order_id = col_integer(),
    user_id = col_integer(),
    eval_set = col_character(),
    order_number = col_integer(),
    order_dow = col_integer(),
    order_hour_of_day = col_character(),
    days_since_prior_order = col_double()
  )
)

# The prior and train order-product files share one layout.
order_products_cols <- cols(
  order_id = col_integer(),
  product_id = col_integer(),
  add_to_cart_order = col_integer(),
  reordered = col_integer()
)
order_products__prior <- read_csv(
  file.path(data_dir, "order_products__prior.csv"),
  col_types = order_products_cols
)
order_products__train <- read_csv(
  file.path(data_dir, "order_products__train.csv"),
  col_types = order_products_cols
)

# Stack both sets into a single order/product table.
order_products <- bind_rows(order_products__prior, order_products__train)
# Render the first six rows of the `aisle` table.
kable(
  x = head(aisle, n = 6L),
  caption = "table 'aisle"
)
| aisle_id | aisle |
|---|---|
| 1 | prepared soups salads |
| 2 | specialty cheeses |
| 3 | energy granola bars |
| 4 | instant foods |
| 5 | marinades meat preparation |
| 6 | other |
# Number of distinct users covered by the analysis
length(unique(order[["user_id"]]))
## [1] 206209
# Render the first six rows of the `product` table.
kable(
  x = head(product, n = 6L),
  caption = "table 'product"
)
| product_id | product_name | aisle_id | department_id |
|---|---|---|---|
| 1 | Chocolate Sandwich Cookies | 61 | 19 |
| 2 | All-Seasons Salt | 104 | 13 |
| 3 | Robust Golden Unsweetened Oolong Tea | 94 | 7 |
| 4 | Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce | 38 | 1 |
| 5 | Green Chile Anytime Sauce | 5 | 13 |
| 6 | Dry Nose Oil | 11 | 11 |
# Example carried over from the R Markdown template: summarises the `cars`
# dataset (speed / dist), which is unrelated to the Instacart tables.
summary(object = cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
# Number of orders per user.
#
# count() is a data-frame verb and cannot be called on a column inside
# summarise(); n() returns the number of rows in each user_id group.
# Note that n() counts rows, whereas SQL COUNT(order_id) skips NULL ids.
d <- order %>%
  group_by(user_id) %>%
  summarise(order_count = n())

# SQL formulation of the same aggregate, kept for reference only because it
# assigned the same result to `d` again. `order` is a reserved word in SQL, so
# the table name has to be quoted.
# NOTE(review): confirm sqldf resolves the quoted name to the `order` data
# frame before re-enabling.
# d <- sqldf('SELECT user_id, COUNT(order_id) AS order_count FROM "order" GROUP BY user_id;')
# Distribution of `days_since_prior_order` across all orders.
#
# With stat = "count", geom_histogram() ignores `binwidth`, so the layer is
# written as geom_bar(), whose default stat is "count": one bar per distinct
# value, which is what the original call ended up drawing.
ggplot(order, aes(days_since_prior_order)) +
  geom_bar(fill = "red")
# --> values to look at: 0, 7 and 30 days
# Orders whose `days_since_prior_order` is exactly 7.
user_7days <- order %>%
  filter(days_since_prior_order == 7)

# Distinct users with at least one such order (the author recorded 100,332).
length(unique(user_7days$user_id))

# Day-of-week distribution of the 7-day orders ...
ggplot(user_7days, aes(order_dow)) +
  geom_bar(stat = "count", fill = "steelblue")

# ... and of all orders, for comparison.
ggplot(order, aes(order_dow)) +
  geom_bar(stat = "count", fill = "steelblue")
# Orders whose `days_since_prior_order` is exactly 6.
user_6days <- order %>%
  filter(days_since_prior_order == 6)

# Day-of-week distribution of the 6-day orders.
ggplot(user_6days, aes(order_dow)) +
  geom_bar(stat = "count", fill = "steelblue")

# Distinct users with at least one such order (the author recorded 90,205).
length(unique(user_6days$user_id))

# Distinct users across all orders, as the reference total.
length(unique(order$user_id))
# Orders placed on the same day as the user's previous order
# (`days_since_prior_order` == 0).
reorder <- filter(order, days_since_prior_order == 0)

# Attach the purchased products and their descriptive columns.
reorder_all <- left_join(reorder, order_products, by = "order_id")
reorder_all <- left_join(reorder_all, product, by = "product_id")

# Products ranked by how often they appear in those orders.
# `product_name` only exists after the joins, so the ranking is built from
# `reorder_all`; `reorder` itself has no such column.
df <- reorder_all %>%
  group_by(product_name) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

## which dow?
table(reorder$order_dow)
table(reorder$order_hour_of_day)
table(reorder$order_dow, reorder$order_hour_of_day)

# `order_dow` is an integer column; it is wrapped in factor() so the bars are
# split and coloured by day of week instead of the fill being discarded.
ggplot(reorder, aes(order_hour_of_day, fill = factor(order_dow))) +
  geom_bar(stat = "count") +
  labs(fill = "order_dow")