library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(knitr)
library(stringr)
library(DT)
order_products = read.csv('dataset/instacartData/order_products__prior.csv')
orders = read.csv('dataset/instacartData/orders.csv')
departments = read.csv('dataset/instacartData/departments.csv')
products = read.csv('dataset/instacartData/products.csv')
aisles = read.csv("dataset/instacartData/aisles.csv")

stat = 
  order_products %>% left_join(orders, by='order_id') %>% left_join(products, by="product_id") %>%
    left_join(departments, by = 'department_id') %>%
    group_by(department, order_hour_of_day) %>%
    summarise(n=n())
ggplot(stat, aes(x = order_hour_of_day, y = n)) +
  geom_bar(stat = 'identity') + facet_grid(~department)

glimpse(orders)
## Observations: 68,422
## Variables: 7
## $ order_id               <int> 1671983, 3071077, 1564518, 2099081, 12271…
## $ user_id                <dbl> 88338, 172869, 134470, 205871, 176700, 84…
## $ eval_set               <fct> prior, prior, prior, prior, prior, prior,…
## $ order_number           <int> 2, 14, 3, 6, 24, 40, 17, 21, 2, 37, 41, 1…
## $ order_dow              <int> 0, 1, 1, 2, 0, 6, 2, 0, 0, 4, 1, 6, 3, 6,…
## $ order_hour_of_day      <int> 18, 11, 16, 11, 11, 10, 12, 10, 14, 14, 1…
## $ days_since_prior_order <int> 30, 12, 7, 30, 0, 6, 21, 4, 16, 4, 4, NA,…
orders <- orders %>% mutate(order_hour_of_day = as.numeric(order_hour_of_day), eval_set = as.factor(eval_set))
products <- products %>% mutate(product_name = as.factor(product_name))
aisles <- aisles %>% mutate(aisle = as.factor(aisle))
orders <- orders %>% mutate(order_hour_of_day = as.numeric(order_hour_of_day), eval_set = as.factor(eval_set))
products <- products %>% mutate(product_name = as.factor(product_name))
aisles <- aisles %>% mutate(aisle = as.factor(aisle))

*대부분 8시 ~ 18시 사이에 고객들이 주문함

orders %>%
  ggplot(aes(x = order_hour_of_day)) +
  geom_histogram(stat="count", fill = "green")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

*요일별로도 살펴보자

orders %>%
  ggplot(aes (x=order_dow)) +
  geom_histogram(stat = "count", fill = "red")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

*언제 재주문 하나?

orders %>%
  ggplot(aes( x = days_since_prior_order)) +
  geom_histogram(stat = "count", fill = "blue")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 4032 rows containing non-finite values (stat_count).

*그렇다면 얼마나 이전에 주문을 했나?

# We can see that there are always at least 3 prior orders.
orders %>% filter(eval_set == "prior") %>% count(order_number) %>% ggplot(aes(order_number,n)) + geom_line(color = "red", size=1) + geom_point(size=2, color="red")

*얼마나 많이 상품을 살까? 주문에 얼마나 많은 상품을 담았을까?

order_products %>% 
  group_by(order_id) %>% 
  summarize(n_items = last(add_to_cart_order)) %>%
  ggplot(aes(x=n_items))+
  geom_histogram(stat="count",fill="red") + 
  geom_rug()+
  coord_cartesian(xlim=c(0,80))
## Warning: Ignoring unknown parameters: binwidth, bins, pad

head(order_products)
##   order_id product_id add_to_cart_order reordered
## 1       53      24852                 1         1
## 2       53      37646                 2         1
## 3       53      47209                 3         1
## 4       53      36606                 4         0
## 5       53      32442                 5         1
## 6       53      14467                 6         1

*베스트 셀링 상품

tmp <- order_products %>% 
  group_by(product_id) %>% 
  summarize(count = n()) %>% 
  top_n(10, wt = count) %>%
  left_join(select(products,product_id,product_name),by="product_id") %>%
  arrange(desc(count)) 
kable(tmp)
product_id count product_name
24852 9434 Banana
13176 7731 Bag of Organic Bananas
21137 5391 Organic Strawberries
21903 4898 Organic Baby Spinach
47209 4321 Organic Hass Avocado
47766 3595 Organic Avocado
47626 3156 Large Lemon
26209 2851 Limes
16797 2792 Strawberries
27845 2753 Organic Whole Milk
- kable 함수를 쓸 때 k nitr패키지를 설치하고 라이브러리로 불러와야함
- 함수 안에 표 에 표시하 기를 원하는 데이터명을 기재하기만 하면, 별도의 제목행 구분 등의 작업 없이도 간단히 표를 그릴 수 있다
- geom_bar 에 서는 sta t 을 반드시 identity 라고 해줘야함
- identity는 y축 값의 높이를 데이터를 기반으로 정해줄 때 사용해줍니다.
- 즉, stat=’i dentity’ 는 y축의 높이를 데이터의 값으로 하는 bar그래프의 형태로 지정한다는 것입니다.
tmp %>% 
  ggplot(aes(x=reorder(product_name, -count), y = count)) +
  geom_bar(stat="identity", fill = "red") +
  theme(axis.text.x=element_text(angle = 90, hjust = 1), axis.title.x = element_blank())

*How often do people order the same items again? :주문된 상품의 59%가 모두 재주문 된것들

tmp <-  order_products %>% 
  group_by(reordered) %>% 
  summarize(count = n()) %>% 
  mutate(reordered = as.factor(reordered)) %>% 
  mutate(proportion = count/sum(count))
kable(tmp)
reordered count proportion
0 264842 0.4069246
1 385996 0.5930754
tmp %>% 
  ggplot(aes(x=reordered,y=count,fill=reordered))+
  geom_bar(stat="identity")

*가장 자주 재주문 된 상품은?

tmp <-  order_products %>% 
  group_by(product_id) %>% 
  summarize(proportion_reordered = mean(reordered), n=n()) %>% 
  filter(n > 40) %>% 
  top_n(10, wt = proportion_reordered) %>% 
  arrange(desc(proportion_reordered)) %>% 
  left_join(products, by = "product_id")
kable(tmp)
product_id proportion_reordered n product_name aisle_id department_id
6286 0.9130435 46 Half and Half 53 16
33572 0.9038462 156 Lactose Free 2% Reduced Fat Milk 91 16
43394 0.8926554 177 Organic Lactose Free Whole Milk 91 16
43209 0.8913043 46 Large Eggs 86 16
5514 0.8823529 85 Organic Homogenized Whole Milk 84 16
9292 0.8750000 64 Half And Half Ultra Pasteurized 84 16
13560 0.8723404 47 A+ Cinnamon Sweet Potato Pancakes 52 1
39180 0.8694268 314 Organic Lowfat 1% Milk 84 16
45504 0.8592965 199 Whole Organic Omega 3 Milk 84 16
2748 0.8571429 112 Organic Reduced Fat Omega-3 Milk 84 16
26364 0.8571429 49 Kefir Cultured Milk Smoothie, Whole Milk, Sublime Slime Lime 120 16
tmp <-order_products %>% 
  group_by(product_id) %>% 
  summarize(proportion_reordered = mean(reordered), n=n()) %>% 
  filter(n>40) %>% 
  top_n(10,wt=proportion_reordered) %>% 
  arrange(desc(proportion_reordered)) %>% 
  left_join(products,by="product_id")

kable(tmp)
product_id proportion_reordered n product_name aisle_id department_id
6286 0.9130435 46 Half and Half 53 16
33572 0.9038462 156 Lactose Free 2% Reduced Fat Milk 91 16
43394 0.8926554 177 Organic Lactose Free Whole Milk 91 16
43209 0.8913043 46 Large Eggs 86 16
5514 0.8823529 85 Organic Homogenized Whole Milk 84 16
9292 0.8750000 64 Half And Half Ultra Pasteurized 84 16
13560 0.8723404 47 A+ Cinnamon Sweet Potato Pancakes 52 1
39180 0.8694268 314 Organic Lowfat 1% Milk 84 16
45504 0.8592965 199 Whole Organic Omega 3 Milk 84 16
2748 0.8571429 112 Organic Reduced Fat Omega-3 Milk 84 16
26364 0.8571429 49 Kefir Cultured Milk Smoothie, Whole Milk, Sublime Slime Lime 120 16
tmp %>% 
  ggplot(aes(x=reorder(product_name, -proportion_reordered), y = proportion_reordered)) +
  geom_bar(stat="identity", fill = "red") +
  theme(axis.text.x=element_text(angle=90, hjust=1), axis.title =  element_blank()) + coord_cartesian(ylim=c(0.85, 0.95))

고객들이 처음 카트에 담는 상품은? pct = put cart first

tmp <- order_products %>% 
  group_by(product_id, add_to_cart_order) %>% 
  summarize(count = n()) %>% mutate(pct = count/sum(count)) %>% 
  filter(add_to_cart_order == 1, count>10) %>% 
  arrange(desc(pct)) %>% 
  left_join(products,by="product_id") %>% 
  select(product_name, pct, count) %>% 
  ungroup() %>% 
  top_n(10, wt=pct) # top(n=출력할 개수, wt = 지정변수)
## Adding missing grouping variables: `product_id`
kable(tmp)
product_id product_name pct count
6729 Cookie Tray 0.4642857 13
37919 Organic 2% Chocolate Milk 0.4626866 31
20518 Club Delight Sausage, Egg, & Cheese Sandwiches 0.4444444 16
22046 Frozen Whole Strawberries 0.4444444 16
35470 Water Mineral 0.4285714 18
20940 Organic Low Fat Milk 0.4259259 69
39643 Organic Dairy Iron-Fortified Toddler Formula 0.4230769 11
40939 Drinking Water 0.4062500 39
37774 Artesian Sparkling Water 0.4054054 15
1729 2% Lactose Free Milk 0.4000000 16
45190 Vodka 0.4000000 44

시각화 reorder(name, mtcars\(mpg)라는 값을 x축에 넣었습니다. *이건 name을 x축에 넣을 때 그 mpg가 작은 것부터 앞에 오도록 순서를 조정하라는 뜻입니다. *만약 값이 큰 것부터 작은 것 순서로 정렬하게 싶을 때는 reorder(name, -mtcars\)mpg)라고 마이너스(-)만 붙이면 됩니다.

tmp %>% 
  ggplot(aes(x=reorder(product_name, -pct), y = pct)) +
  geom_bar(stat = "identity", fill = "red") +
  theme(axis.text.x = element_text(angle = 50, hjust = 1), axis.title = element_blank()) + coord_cartesian(ylim = c(0.4,0.7))

마지막 주문 시간과 재주문 사이의 관련 가능성 고객들이 같은 날에 주문을 다시한다면 동일 상품을 많이 주문하는데.. 30일 이후에 재주문함 *고객들은 주문에서 새로운 것을 시도하려고 함

order_products %>% 
  left_join(orders, by = "order_id") %>% 
  group_by(days_since_prior_order) %>% 
  summarize(mean_reorder = mean(reordered)) %>% 
  ggplot(aes(x=days_since_prior_order, y = mean_reorder)) +
  geom_bar(stat="identity", fill = "blue")
## Warning: Removed 1 rows containing missing values (position_stack).

주문수와 재주문 가능성의 관계 많은 수의 주문의 상품은 자연스럽게 재주문 하는 경향을 보임 하지만 천장 효과임..

order_products %>% 
  group_by(product_id) %>% 
  summarize(proportion_reorderd = mean(reordered), n=n()) %>% 
  ggplot(aes(x=n, y=proportion_reorderd)) +
  geom_point() +
  geom_smooth(color = "red") +
  coord_cartesian(xlim = c(0,2000))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

*Organic vs Non-organic(주문 비율을 보자)

products <- products %>% 
    mutate(organic=ifelse(str_detect(str_to_lower(products$product_name),'organic'),"organic","not organic"), organic= as.factor(organic))
tmp <- order_products %>% 
  left_join(products, by="product_id") %>% 
  group_by(organic) %>% 
  summarize(count = n()) %>% 
  mutate(proportion = count/sum(count))
kable(tmp)
organic count proportion
not organic 444775 0.6833882
organic 206063 0.3166118
tmp %>% 
  ggplot(aes(x=organic,y=count, fill=organic))+
  geom_bar(stat="identity")

*Organic vs Non-Organic 재주문율

tmp <- order_products %>%  left_join(products, by ="product_id") %>% group_by(organic) %>% summarize(mean_reordered = mean(reordered))
kable(tmp)
organic mean_reordered
not organic 0.5717003
organic 0.6392123
tmp %>% 
  ggplot(aes(x=organic, y= mean_reordered, fill = organic)) +
  geom_bar(stat = "identity")

*Treemap

library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(treemap)

tmp <- products %>% group_by(department_id, aisle_id) %>% summarize(n=n())
tmp <- tmp %>% left_join(departments,by="department_id")
tmp <- tmp %>% left_join(aisles,by="aisle_id")

tmp2<-order_products %>% 
  group_by(product_id) %>% 
  summarize(count=n()) %>% 
  left_join(products,by="product_id") %>% 
  ungroup() %>% 
  group_by(department_id,aisle_id) %>% 
  summarize(sumcount = sum(count)) %>% 
  left_join(tmp, by = c("department_id", "aisle_id")) %>% 
  mutate(onesize = 1)
treemap(tmp2,index=c("department","aisle"),vSize="onesize",vColor="department",palette="Set3",title="",sortID="-sumcount", border.col="#FFFFFF",type="categorical", fontsize.legend = 0,bg.labels = "#FFFFFF")