Recode

# Recode columns: the order hour becomes numeric, and the label-like columns
# (eval_set, product_name, aisle, department) become factors.
orders <- mutate(
  orders,
  order_hour_of_day = as.numeric(order_hour_of_day),
  eval_set = as.factor(eval_set)
)
products <- mutate(products, product_name = as.factor(product_name))
aisles <- mutate(aisles, aisle = as.factor(aisle))
departments <- mutate(departments, department = as.factor(department))

When do people order? (HOUR)

# Number of orders placed in each hour of the day.
# geom_bar() counts rows per x value by default. The earlier
# geom_histogram(stat = "count") form drew the same bars but made ggplot2
# warn about ignored binning parameters (binwidth, bins, pad).
orders %>%
  ggplot(aes(x = order_hour_of_day)) +
  geom_bar(fill = "red") +
  theme_economist()

When do people order? (DAY OF WEEK)

# Axis-label helper: render the values in scientific notation and return them
# as an expression vector (one element per value).
#
# l: vector of values to label.
NotFancy <- function(l) {
  parse(text = format(l, scientific = TRUE))
}

# Total number of orders per day of the week, drawn as a line.
# Lines have no `fill` aesthetic (ggplot2 ignored it with a warning), so the
# red is applied through `colour` instead.
orders %>%
  ggplot(aes(x = order_dow)) +
  geom_line(stat = "count", colour = "red") +
  theme_economist() +
  xlab("DAY OF WEEK") +
  scale_y_continuous(labels = NotFancy) +
  ylab("Total Order Counts")

When do they order again?

# Distribution of the number of days since the previous order.
# Rows with a missing days_since_prior_order are removed explicitly; before,
# stat_count dropped them with a "non-finite values" warning.
# NOTE(review): the missing values are presumably each user's first order;
# confirm against the data dictionary.
# geom_bar() replaces geom_histogram(stat = "count"), which warned about
# ignored binning parameters.
orders %>%
  filter(!is.na(days_since_prior_order)) %>%
  ggplot(aes(x = days_since_prior_order)) +
  geom_bar(fill = "red") +
  theme_economist()

How many prior orders are there?

# For the "prior" evaluation set: how many orders exist at each order number,
# shown as a red line with a point at every order number.
orders %>%
  filter(eval_set == "prior") %>%
  count(order_number) %>%
  ggplot(aes(x = order_number, y = n)) +
  geom_line(color = "red", size = 1) +
  geom_point(size = 2, color = "red") +
  theme_economist()

How many items do people buy?

# Distribution of basket size (number of items per order).
# The basket size is the largest add_to_cart_order value within the order.
# max() gives the same answer as last() when rows are sorted by cart position,
# but does not depend on that ordering.
# geom_bar() replaces geom_histogram(stat = "count"), which warned about
# ignored binning parameters.
order_products %>%
  group_by(order_id) %>%
  summarize(n_items = max(add_to_cart_order)) %>%
  ggplot(aes(x = n_items)) +
  geom_bar(fill = "red") +
  theme_economist() +
  geom_rug() +
  # Zoom the x axis to 0-80 items without discarding data.
  coord_cartesian(xlim = c(0, 80))

Bestsellers

# The ten products that appear most often in order_products, with their names
# attached and sorted from most to least ordered.
top10_ordered_products <- order_products %>%
  group_by(product_id) %>%
  summarize(count = n()) %>%
  top_n(n = 10, wt = count) %>%
  left_join(
    select(products, product_id, product_name),
    by = "product_id"
  ) %>%
  arrange(desc(count))

# Bar chart of those counts, bars ordered by descending count.
ggplot(
  data = top10_ordered_products,
  mapping = aes(x = reorder(product_name, -count), y = count)
) +
  geom_bar(stat = "identity", fill = "blue") +
  theme_wsj() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title.x = element_blank()
  )

Most often reordered

# Share of purchases that were reorders, per product. Only products with more
# than 40 rows in order_products are considered; the ten highest shares are
# kept, sorted descending, and joined to the product details.
top10_reordered_products <- order_products %>%
  group_by(product_id) %>%
  summarize(
    proportion_reordered = mean(reordered),
    n = n()
  ) %>%
  filter(n > 40) %>%
  top_n(n = 10, wt = proportion_reordered) %>%
  arrange(desc(proportion_reordered)) %>%
  left_join(products, by = "product_id")

# Bar chart of the reorder share; the y axis is zoomed to 0.85-0.95.
ggplot(
  data = top10_reordered_products,
  mapping = aes(
    x = reorder(product_name, -proportion_reordered),
    y = proportion_reordered
  )
) +
  geom_bar(stat = "identity", fill = "red") +
  theme_wsj() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title.x = element_blank()
  ) +
  coord_cartesian(ylim = c(0.85, 0.95))

Which item do people put into the cart first?

# For every product: the share of its purchases in which it was the first item
# added to the cart. Products added first 10 times or fewer are excluded, and
# the ten highest shares are kept.
first_item_in_baskset <- order_products %>%
  group_by(product_id, add_to_cart_order) %>%
  summarize(count = n()) %>%
  # Still grouped by product_id here, so sum(count) is the product's total.
  mutate(pct = count / sum(count)) %>%
  # Drop the grouping before select(); selecting on grouped data silently
  # re-added product_id ("Adding missing grouping variables").
  ungroup() %>%
  filter(add_to_cart_order == 1, count > 10) %>%
  arrange(desc(pct)) %>%
  left_join(products, by = "product_id") %>%
  # product_id is listed explicitly so the result keeps the same columns.
  select(product_id, product_name, pct, count) %>%
  top_n(10, wt = pct)

# Bar chart of the first-in-cart share; the y axis is zoomed to 0.4-0.7.
first_item_in_baskset %>%
  ggplot(aes(x = reorder(product_name, -pct), y = pct)) +
  geom_bar(stat = "identity", fill = "red") +
  theme_economist() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title.x = element_blank()
  ) +
  coord_cartesian(ylim = c(0.4, 0.7))

Association between time of last order and probability of reorder

  # Mean reorder flag of purchased items, grouped by the number of days since
  # the customer's previous order (taken from the joined orders table).
  order_products %>%
    left_join(orders, by = "order_id") %>%
    group_by(days_since_prior_order) %>%
    summarize(mean_reorder = mean(reordered)) %>%
    ggplot(aes(x = days_since_prior_order, y = mean_reorder)) +
    geom_bar(stat = "identity", fill = "red") +
    theme_wsj()

Organic vs Non-organic

# Axis-label helper: format values as plain (non-scientific) numbers with a
# comma as the thousands separator, e.g. 1000000 -> "1,000,000".
#
# l: numeric vector of values to label.
# Returns a character vector with one label per element of `l`.
fancy_scientific <- function(l) {
  # big.mark and decimal.mark must differ. The previous call set both to ","
  # (plus a stray empty argument and a "." small.mark), which made format()
  # warn and turned 1234.5 into the ambiguous "1,234,5".
  format(
    l,
    digits = 9,
    big.mark = ",",
    decimal.mark = ".",
    scientific = FALSE
  )
}

# Flag each product as "organic" or "not organic" depending on whether its
# lower-cased name contains the text "organic"; the flag is stored as a factor.
# The column is referenced directly inside mutate() rather than through
# `products$product_name`, so the expression always uses the rows that are
# actually being mutated.
products <- products %>%
  mutate(
    organic = ifelse(
      str_detect(str_to_lower(product_name), "organic"),
      "organic",
      "not organic"
    ),
    organic = as.factor(organic)
  )

# Number of purchased items per organic flag, plus each flag's share of all
# purchased items.
Organic_vs_nonOrganic <- order_products %>%
  left_join(products, by = "product_id") %>%
  group_by(organic) %>%
  summarize(count = n()) %>%
  mutate(proportion = count / sum(count))

  
  
# Bar chart of purchase counts for organic vs. non-organic products.
# The legend title is suppressed once; the identical guides() call used to be
# applied twice. The y axis is labelled with fancy_scientific.
Organic_vs_nonOrganic %>%
  ggplot(aes(x = organic, y = count, fill = organic)) +
  theme_wsj(base_size = 7.5) +
  geom_bar(stat = "identity") +
  guides(fill = guide_legend(title = NULL)) +
  ggtitle('Number of products:Organic vs Non-organic') +
  scale_y_continuous(labels = fancy_scientific)
## Warning in prettyNum(.Internal(format(x, trim, digits, nsmall, width, 3L, :
## 'big.mark' and 'decimal.mark' are both ',', which could be confusing

Reordering Organic vs Non-Organic

# Mean reorder flag of purchased items, split by the organic flag.
reorder_Organic_vs_nonOrganic <- order_products %>%
  left_join(products, by = "product_id") %>%
  group_by(organic) %>%
  summarize(mean_reordered = mean(reordered))

# Bar chart of that mean, one bar per organic flag, without a legend title.
ggplot(
  data = reorder_Organic_vs_nonOrganic,
  mapping = aes(x = organic, fill = organic, y = mean_reordered)
) +
  geom_bar(stat = "identity") +
  guides(fill = guide_legend(title = NULL)) +
  theme_wsj(base_size = 7) +
  ggtitle('Reorder Percentage: Organic vs Non-Organic')

How are aisles organized within departments?

# Number of products in every department/aisle combination, with the
# department and aisle lookup tables joined on.
tmp <- products %>%
  group_by(department_id, aisle_id) %>%
  summarize(n = n()) %>%
  left_join(departments, by = "department_id") %>%
  left_join(aisles, by = "aisle_id")

# Sales per department/aisle: count purchases per product, roll them up to
# department/aisle level, attach the lookup columns from `tmp`, and add a
# constant `onesize` column (used for equal-sized treemap boxes).
tmp2 <- order_products %>%
  group_by(product_id) %>%
  summarize(count = n()) %>%
  left_join(products, by = "product_id") %>%
  ungroup() %>%
  group_by(department_id, aisle_id) %>%
  summarize(numofsales = sum(count)) %>%
  left_join(tmp, by = c("department_id", "aisle_id")) %>%
  mutate(onesize = 1)
  


# Treemap of aisles nested within departments. Every aisle gets the same box
# size (onesize), boxes are coloured by department and sorted by numofsales.
treemap(
  tmp2,
  index = c("department", "aisle"),
  vSize = "onesize",
  vColor = "department",
  palette = "Set3",
  title = "",
  sortID = "numofsales",
  border.col = "#FFFFFF",
  type = "categorical",
  fontsize.legend = 0,
  bg.labels = "#FFFFFF"
)

----------------
# Print the number of products in every aisle/department combination.
summarize(group_by(products, aisle_id, department_id), n = n())
## Warning in Ops.factor(left): '-' not meaningful for factors

## Warning in Ops.factor(left): '-' not meaningful for factors
## Source: local data frame [134 x 3]
## Groups: aisle_id [?]
## 
##    aisle_id department_id     n
##       <int>         <int> <int>
## 1         1            20   146
## 2         2            16   271
## 3         3            19   832
## 4         4             9   543
## 5         5            13   409
## 6         6             2   548
## 7         7            12   100
## 8         8             3   297
## 9         9             9   399
## 10       10            17   218
## # ... with 124 more rows

How often are products from the department/aisle sold?

The size of the boxes shows the number of sales.
# Treemap of aisles nested within departments, box size = number of sales.
treemap(
  tmp2,
  index = c("department", "aisle"),
  vSize = "numofsales",
  title = "",
  palette = "Set1",
  border.col = "#FFFFFF"
)

How are aisles organized within departments?

# Treemap of aisles nested within departments with equal-sized boxes,
# coloured by department; sortID is "-numofsales".
treemap(
  tmp2,
  index = c("department", "aisle"),
  vSize = "onesize",
  vColor = "department",
  palette = "Set3",
  title = "",
  sortID = "-numofsales",
  border.col = "#FFFFFF",
  type = "categorical",
  fontsize.legend = 0,
  bg.labels = "#FFFFFF"
)

Which hour in each day has the most orders?

# Read the orders file from disk with data.table::fread(); this replaces
# whatever `orders` object existed before.
# NOTE(review): the path is absolute and machine-specific.
orders <- fread(input = 'C:/Users/User/Desktop/KAGGLE/orders.csv')
## 
Read 67.5% of 3421083 rows
Read 3421083 rows and 7 (of 7) columns from 0.101 GB file in 00:00:03
# Build the hourly order-count bar chart for one day of the week.
#
# dow_value:   value of `order_dow` selecting the day to plot.
# peak_hour:   hour whose bar is drawn in gold; all other bars are grey.
# show_x_axis: when TRUE, keep the x-axis text and label the axis
#              "Hour of the Day"; otherwise blank the x-axis text and title.
#
# Reads the global data.table `orders`. The fill vector assumes exactly one
# bar per hour 0..23 (24 bars), exactly as the hand-written fill vectors did.
plot_orders_by_hour <- function(dow_value, peak_hour, show_x_axis = FALSE) {
  # 24 grey bars with the peak hour (0-based) switched to gold.
  bar_fill <- rep("grey25", 24)
  bar_fill[peak_hour + 1] <- "gold"

  p <- ggplot(orders[order_dow == dow_value, ], aes(x = order_hour_of_day)) +
    geom_bar(fill = bar_fill) +
    theme_minimal() +
    theme(axis.ticks.x = element_blank(),
          axis.ticks.y = element_blank(),
          axis.text.y = element_blank(),
          legend.position = "none",
          panel.grid.major = element_blank()) +
    labs(y = paste("Day", dow_value))

  if (show_x_axis) {
    p + labs(x = "Hour of the Day")
  } else {
    p + theme(axis.text.x = element_blank(),
              axis.title.x = element_blank())
  }
}

# One panel per day of the week. Days 0 and 6 highlight hour 14, days 1-5
# highlight hour 10. Only the bottom panel (day 6) shows the x axis.
p0 <- plot_orders_by_hour(0, peak_hour = 14)
p1 <- plot_orders_by_hour(1, peak_hour = 10)
p2 <- plot_orders_by_hour(2, peak_hour = 10)
p3 <- plot_orders_by_hour(3, peak_hour = 10)
p4 <- plot_orders_by_hour(4, peak_hour = 10)
p5 <- plot_orders_by_hour(5, peak_hour = 10)
p6 <- plot_orders_by_hour(6, peak_hour = 14, show_x_axis = TRUE)



# Stack the seven per-day panels vertically in a single column.
grid.arrange(grobs = list(p0, p1, p2, p3, p4, p5, p6), ncol = 1)