library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(knitr)
library(stringr)
library(DT)
order_products = read.csv('dataset/instacartData/order_products__prior.csv')
orders = read.csv('dataset/instacartData/orders.csv')
departments = read.csv('dataset/instacartData/departments.csv')
products = read.csv('dataset/instacartData/products.csv')
aisles = read.csv("dataset/instacartData/aisles.csv")
stat =
order_products %>% left_join(orders, by='order_id') %>% left_join(products, by="product_id") %>%
left_join(departments, by = 'department_id') %>%
group_by(department, order_hour_of_day) %>%
summarise(n=n())
ggplot(stat, aes(x = order_hour_of_day, y = n)) +
geom_bar(stat = 'identity') + facet_grid(~department)
glimpse(orders)
## Observations: 68,422
## Variables: 7
## $ order_id <int> 1671983, 3071077, 1564518, 2099081, 12271…
## $ user_id <dbl> 88338, 172869, 134470, 205871, 176700, 84…
## $ eval_set <fct> prior, prior, prior, prior, prior, prior,…
## $ order_number <int> 2, 14, 3, 6, 24, 40, 17, 21, 2, 37, 41, 1…
## $ order_dow <int> 0, 1, 1, 2, 0, 6, 2, 0, 0, 4, 1, 6, 3, 6,…
## $ order_hour_of_day <int> 18, 11, 16, 11, 11, 10, 12, 10, 14, 14, 1…
## $ days_since_prior_order <int> 30, 12, 7, 30, 0, 6, 21, 4, 16, 4, 4, NA,…
orders <- orders %>% mutate(order_hour_of_day = as.numeric(order_hour_of_day), eval_set = as.factor(eval_set))
products <- products %>% mutate(product_name = as.factor(product_name))
aisles <- aisles %>% mutate(aisle = as.factor(aisle))
orders <- orders %>% mutate(order_hour_of_day = as.numeric(order_hour_of_day), eval_set = as.factor(eval_set))
products <- products %>% mutate(product_name = as.factor(product_name))
aisles <- aisles %>% mutate(aisle = as.factor(aisle))
*대부분 8시 ~ 18시 사이에 고객들이 주문함
orders %>%
ggplot(aes(x = order_hour_of_day)) +
geom_histogram(stat="count", fill = "green")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
*요일별로도 살펴보자
orders %>%
ggplot(aes (x=order_dow)) +
geom_histogram(stat = "count", fill = "red")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
*언제 재주문 하나?
orders %>%
ggplot(aes( x = days_since_prior_order)) +
geom_histogram(stat = "count", fill = "blue")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 4032 rows containing non-finite values (stat_count).
*그렇다면 얼마나 이전에 주문을 했나?
# We can see that there are always at least 3 prior orders.
orders %>% filter(eval_set == "prior") %>% count(order_number) %>% ggplot(aes(order_number,n)) + geom_line(color = "red", size=1) + geom_point(size=2, color="red")
*얼마나 많이 상품을 살까? 주문에 얼마나 많은 상품을 담았을까?
order_products %>%
group_by(order_id) %>%
summarize(n_items = last(add_to_cart_order)) %>%
ggplot(aes(x=n_items))+
geom_histogram(stat="count",fill="red") +
geom_rug()+
coord_cartesian(xlim=c(0,80))
## Warning: Ignoring unknown parameters: binwidth, bins, pad
head(order_products)
## order_id product_id add_to_cart_order reordered
## 1 53 24852 1 1
## 2 53 37646 2 1
## 3 53 47209 3 1
## 4 53 36606 4 0
## 5 53 32442 5 1
## 6 53 14467 6 1
*베스트 셀링 상품
tmp <- order_products %>%
group_by(product_id) %>%
summarize(count = n()) %>%
top_n(10, wt = count) %>%
left_join(select(products,product_id,product_name),by="product_id") %>%
arrange(desc(count))
kable(tmp)
| product_id | count | product_name |
|---|---|---|
| 24852 | 9434 | Banana |
| 13176 | 7731 | Bag of Organic Bananas |
| 21137 | 5391 | Organic Strawberries |
| 21903 | 4898 | Organic Baby Spinach |
| 47209 | 4321 | Organic Hass Avocado |
| 47766 | 3595 | Organic Avocado |
| 47626 | 3156 | Large Lemon |
| 26209 | 2851 | Limes |
| 16797 | 2792 | Strawberries |
| 27845 | 2753 | Organic Whole Milk |
| - kable 함수를 | 쓸 때 k | nitr패키지를 설치하고 라이브러리로 불러와야함 |
| - 함수 안에 표 | 에 표시하 | 기를 원하는 데이터명을 기재하기만 하면, 별도의 제목행 구분 등의 작업 없이도 간단히 표를 그릴 수 있다 |
| - geom_bar 에 | 서는 sta | t 을 반드시 identity 라고 해줘야함 |
| - identity는 | y축 값의 | 높이를 데이터를 기반으로 정해줄 때 사용해줍니다. |
| - 즉, stat=’i | dentity’ | 는 y축의 높이를 데이터의 값으로 하는 bar그래프의 형태로 지정한다는 것입니다. |
tmp %>%
ggplot(aes(x=reorder(product_name, -count), y = count)) +
geom_bar(stat="identity", fill = "red") +
theme(axis.text.x=element_text(angle = 90, hjust = 1), axis.title.x = element_blank())
*How often do people order the same items again? :주문된 상품의 59%가 모두 재주문 된것들
tmp <- order_products %>%
group_by(reordered) %>%
summarize(count = n()) %>%
mutate(reordered = as.factor(reordered)) %>%
mutate(proportion = count/sum(count))
kable(tmp)
| reordered | count | proportion |
|---|---|---|
| 0 | 264842 | 0.4069246 |
| 1 | 385996 | 0.5930754 |
tmp %>%
ggplot(aes(x=reordered,y=count,fill=reordered))+
geom_bar(stat="identity")
*가장 자주 재주문 된 상품은?
tmp <- order_products %>%
group_by(product_id) %>%
summarize(proportion_reordered = mean(reordered), n=n()) %>%
filter(n > 40) %>%
top_n(10, wt = proportion_reordered) %>%
arrange(desc(proportion_reordered)) %>%
left_join(products, by = "product_id")
kable(tmp)
| product_id | proportion_reordered | n | product_name | aisle_id | department_id |
|---|---|---|---|---|---|
| 6286 | 0.9130435 | 46 | Half and Half | 53 | 16 |
| 33572 | 0.9038462 | 156 | Lactose Free 2% Reduced Fat Milk | 91 | 16 |
| 43394 | 0.8926554 | 177 | Organic Lactose Free Whole Milk | 91 | 16 |
| 43209 | 0.8913043 | 46 | Large Eggs | 86 | 16 |
| 5514 | 0.8823529 | 85 | Organic Homogenized Whole Milk | 84 | 16 |
| 9292 | 0.8750000 | 64 | Half And Half Ultra Pasteurized | 84 | 16 |
| 13560 | 0.8723404 | 47 | A+ Cinnamon Sweet Potato Pancakes | 52 | 1 |
| 39180 | 0.8694268 | 314 | Organic Lowfat 1% Milk | 84 | 16 |
| 45504 | 0.8592965 | 199 | Whole Organic Omega 3 Milk | 84 | 16 |
| 2748 | 0.8571429 | 112 | Organic Reduced Fat Omega-3 Milk | 84 | 16 |
| 26364 | 0.8571429 | 49 | Kefir Cultured Milk Smoothie, Whole Milk, Sublime Slime Lime | 120 | 16 |
tmp <-order_products %>%
group_by(product_id) %>%
summarize(proportion_reordered = mean(reordered), n=n()) %>%
filter(n>40) %>%
top_n(10,wt=proportion_reordered) %>%
arrange(desc(proportion_reordered)) %>%
left_join(products,by="product_id")
kable(tmp)
| product_id | proportion_reordered | n | product_name | aisle_id | department_id |
|---|---|---|---|---|---|
| 6286 | 0.9130435 | 46 | Half and Half | 53 | 16 |
| 33572 | 0.9038462 | 156 | Lactose Free 2% Reduced Fat Milk | 91 | 16 |
| 43394 | 0.8926554 | 177 | Organic Lactose Free Whole Milk | 91 | 16 |
| 43209 | 0.8913043 | 46 | Large Eggs | 86 | 16 |
| 5514 | 0.8823529 | 85 | Organic Homogenized Whole Milk | 84 | 16 |
| 9292 | 0.8750000 | 64 | Half And Half Ultra Pasteurized | 84 | 16 |
| 13560 | 0.8723404 | 47 | A+ Cinnamon Sweet Potato Pancakes | 52 | 1 |
| 39180 | 0.8694268 | 314 | Organic Lowfat 1% Milk | 84 | 16 |
| 45504 | 0.8592965 | 199 | Whole Organic Omega 3 Milk | 84 | 16 |
| 2748 | 0.8571429 | 112 | Organic Reduced Fat Omega-3 Milk | 84 | 16 |
| 26364 | 0.8571429 | 49 | Kefir Cultured Milk Smoothie, Whole Milk, Sublime Slime Lime | 120 | 16 |
tmp %>%
ggplot(aes(x=reorder(product_name, -proportion_reordered), y = proportion_reordered)) +
geom_bar(stat="identity", fill = "red") +
theme(axis.text.x=element_text(angle=90, hjust=1), axis.title = element_blank()) + coord_cartesian(ylim=c(0.85, 0.95))
고객들이 처음 카트에 담는 상품은? pct = put cart first
tmp <- order_products %>%
group_by(product_id, add_to_cart_order) %>%
summarize(count = n()) %>% mutate(pct = count/sum(count)) %>%
filter(add_to_cart_order == 1, count>10) %>%
arrange(desc(pct)) %>%
left_join(products,by="product_id") %>%
select(product_name, pct, count) %>%
ungroup() %>%
top_n(10, wt=pct) # top(n=출력할 개수, wt = 지정변수)
## Adding missing grouping variables: `product_id`
kable(tmp)
| product_id | product_name | pct | count |
|---|---|---|---|
| 6729 | Cookie Tray | 0.4642857 | 13 |
| 37919 | Organic 2% Chocolate Milk | 0.4626866 | 31 |
| 20518 | Club Delight Sausage, Egg, & Cheese Sandwiches | 0.4444444 | 16 |
| 22046 | Frozen Whole Strawberries | 0.4444444 | 16 |
| 35470 | Water Mineral | 0.4285714 | 18 |
| 20940 | Organic Low Fat Milk | 0.4259259 | 69 |
| 39643 | Organic Dairy Iron-Fortified Toddler Formula | 0.4230769 | 11 |
| 40939 | Drinking Water | 0.4062500 | 39 |
| 37774 | Artesian Sparkling Water | 0.4054054 | 15 |
| 1729 | 2% Lactose Free Milk | 0.4000000 | 16 |
| 45190 | Vodka | 0.4000000 | 44 |
시각화 reorder(name, mtcars\(mpg)라는 값을 x축에 넣었습니다. *이건 name을 x축에 넣을 때 그 mpg가 작은 것부터 앞에 오도록 순서를 조정하라는 뜻입니다. *만약 값이 큰 것부터 작은 것 순서로 정렬하게 싶을 때는 reorder(name, -mtcars\)mpg)라고 마이너스(-)만 붙이면 됩니다.
tmp %>%
ggplot(aes(x=reorder(product_name, -pct), y = pct)) +
geom_bar(stat = "identity", fill = "red") +
theme(axis.text.x = element_text(angle = 50, hjust = 1), axis.title = element_blank()) + coord_cartesian(ylim = c(0.4,0.7))
마지막 주문 시간과 재주문 사이의 관련 가능성 고객들이 같은 날에 주문을 다시한다면 동일 상품을 많이 주문하는데.. 30일 이후에 재주문함 *고객들은 주문에서 새로운 것을 시도하려고 함
order_products %>%
left_join(orders, by = "order_id") %>%
group_by(days_since_prior_order) %>%
summarize(mean_reorder = mean(reordered)) %>%
ggplot(aes(x=days_since_prior_order, y = mean_reorder)) +
geom_bar(stat="identity", fill = "blue")
## Warning: Removed 1 rows containing missing values (position_stack).
주문수와 재주문 가능성의 관계 많은 수의 주문의 상품은 자연스럽게 재주문 하는 경향을 보임 하지만 천장 효과임..
order_products %>%
group_by(product_id) %>%
summarize(proportion_reorderd = mean(reordered), n=n()) %>%
ggplot(aes(x=n, y=proportion_reorderd)) +
geom_point() +
geom_smooth(color = "red") +
coord_cartesian(xlim = c(0,2000))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
*Organic vs Non-organic(주문 비율을 보자)
products <- products %>%
mutate(organic=ifelse(str_detect(str_to_lower(products$product_name),'organic'),"organic","not organic"), organic= as.factor(organic))
tmp <- order_products %>%
left_join(products, by="product_id") %>%
group_by(organic) %>%
summarize(count = n()) %>%
mutate(proportion = count/sum(count))
kable(tmp)
| organic | count | proportion |
|---|---|---|
| not organic | 444775 | 0.6833882 |
| organic | 206063 | 0.3166118 |
tmp %>%
ggplot(aes(x=organic,y=count, fill=organic))+
geom_bar(stat="identity")
*Organic vs Non-Organic 재주문율
tmp <- order_products %>% left_join(products, by ="product_id") %>% group_by(organic) %>% summarize(mean_reordered = mean(reordered))
kable(tmp)
| organic | mean_reordered |
|---|---|
| not organic | 0.5717003 |
| organic | 0.6392123 |
tmp %>%
ggplot(aes(x=organic, y= mean_reordered, fill = organic)) +
geom_bar(stat = "identity")
*Treemap
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(treemap)
tmp <- products %>% group_by(department_id, aisle_id) %>% summarize(n=n())
tmp <- tmp %>% left_join(departments,by="department_id")
tmp <- tmp %>% left_join(aisles,by="aisle_id")
tmp2<-order_products %>%
group_by(product_id) %>%
summarize(count=n()) %>%
left_join(products,by="product_id") %>%
ungroup() %>%
group_by(department_id,aisle_id) %>%
summarize(sumcount = sum(count)) %>%
left_join(tmp, by = c("department_id", "aisle_id")) %>%
mutate(onesize = 1)
treemap(tmp2,index=c("department","aisle"),vSize="onesize",vColor="department",palette="Set3",title="",sortID="-sumcount", border.col="#FFFFFF",type="categorical", fontsize.legend = 0,bg.labels = "#FFFFFF")