Recode

orders <- orders %>% mutate(order_hour_of_day = as.numeric(order_hour_of_day), eval_set = as.factor(eval_set))
products <- products %>% mutate(product_name = as.factor(product_name))
aisles <- aisles %>% mutate(aisle = as.factor(aisle))
departments <- departments %>% mutate(department = as.factor(department))

*When do people order? (HOUR)

orders %>% 
  ggplot(aes(x=order_hour_of_day)) + 
  geom_histogram(stat="count",fill="red")+theme_economist()
## Warning: Ignoring unknown parameters: binwidth, bins, pad

When do people order? (DAY OF WEEK)

NotFancy <- function(l) {
 l <- format(l, scientific = TRUE)
 parse(text=l)
}

orders %>% 
  ggplot(aes(x=order_dow)) + 
  geom_line(stat="count",fill="red")+theme_economist()+xlab("DAY OF WEEK")+
     scale_y_continuous(labels=NotFancy) +ylab("Total Order Counts")
## Warning: Ignoring unknown parameters: fill

When do they order again?

orders %>% 
  ggplot(aes(x=days_since_prior_order)) + 
  geom_histogram(stat="count",fill="red")+theme_economist()
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 206209 rows containing non-finite values (stat_count).

How many prior orders are there?

orders %>% filter(eval_set=="prior") %>% count(order_number) %>% ggplot(aes(order_number,n)) + geom_line(color="red", size=1)+geom_point(size=2, color="red")+theme_economist()

How many items do people buy?

order_products %>% 
  group_by(order_id) %>% 
  summarize(n_items = last(add_to_cart_order)) %>%
  ggplot(aes(x=n_items))+
  geom_histogram(stat="count",fill="red")+theme_economist()+ 
  geom_rug()+
  coord_cartesian(xlim=c(0,80))
## Warning: Ignoring unknown parameters: binwidth, bins, pad

Bestsellers

top10_ordered_products <- order_products %>% 
  group_by(product_id) %>% 
  summarize(count = n()) %>% 
  top_n(10, wt = count) %>%
  left_join(select(products,product_id,product_name),by="product_id") %>%
  arrange(desc(count)) 


top10_ordered_products %>% 
  ggplot(aes(x=reorder(product_name,-count), y=count))+
  geom_bar(stat="identity",fill="blue")+theme_wsj()+
  theme(axis.text.x=element_text(angle=90, hjust=1),axis.title.x = element_blank())

Most often reordered

top10_reordered_products <-order_products %>% 
  group_by(product_id) %>% 
  summarize(proportion_reordered = mean(reordered), n=n()) %>% 
  filter(n>40) %>% 
  top_n(10,wt=proportion_reordered) %>% 
  arrange(desc(proportion_reordered)) %>% 
  left_join(products,by="product_id")

top10_reordered_products %>% 
  ggplot(aes(x=reorder(product_name,-proportion_reordered), y=proportion_reordered))+
  geom_bar(stat="identity",fill="red")+theme_wsj()+
  theme(axis.text.x=element_text(angle=90, hjust=1),axis.title.x = element_blank())+coord_cartesian(ylim=c(0.85,0.95))

Which item do people put into the cart first?

first_item_in_baskset <- order_products %>% 
  group_by(product_id, add_to_cart_order) %>% 
  summarize(count = n()) %>% mutate(pct=count/sum(count)) %>% 
  filter(add_to_cart_order == 1, count>10) %>% 
  arrange(desc(pct)) %>% 
  left_join(products,by="product_id") %>% 
  select(product_name, pct, count) %>% 
  ungroup() %>% 
  top_n(10, wt=pct)
## Adding missing grouping variables: `product_id`
first_item_in_baskset %>% 
  ggplot(aes(x=reorder(product_name,-pct), y=pct))+
  geom_bar(stat="identity",fill="red")+theme_economist()+
  theme(axis.text.x=element_text(angle=90, hjust=1),axis.title.x = element_blank())+coord_cartesian(ylim=c(0.4,0.7))

Association between time of last order and probability of reorder

  order_products %>% 
  left_join(orders,by="order_id") %>% 
  group_by(days_since_prior_order) %>%
  summarize(mean_reorder = mean(reordered)) %>%
  ggplot(aes(x=days_since_prior_order,y=mean_reorder))+theme_wsj()+
  geom_bar(stat="identity",fill="red")

Organic vs Non-organic

fancy_scientific <- function(l) {
     # turn in to character string in scientific notation
     l <- format(l, digits=9, decimal.mark=",", big.mark=",",small.mark=".", , small.interval=3,scientific = FALSE)
}

products <- products %>% 
mutate(organic=ifelse(str_detect(str_to_lower(products$product_name),'organic'),"organic","not organic"), organic= as.factor(organic))

Organic_vs_nonOrganic <- order_products %>% 
  left_join(products, by="product_id") %>% 
  group_by(organic) %>% 
  summarize(count = n()) %>% 
  mutate(proportion = count/sum(count))

  
  
Organic_vs_nonOrganic  %>% 
  ggplot(aes(x=organic,y=count, fill=organic))+theme_wsj(base_size = 7.5)+
  geom_bar(stat="identity")+guides(fill=guide_legend(title=NULL))+guides(fill=guide_legend(title=NULL))+ggtitle('Number of products:Organic vs Non-organic') +scale_y_continuous(labels=fancy_scientific) 
## Warning in prettyNum(.Internal(format(x, trim, digits, nsmall, width, 3L, :
## 'big.mark' and 'decimal.mark' are both ',', which could be confusing

Reordering Organic vs Non-Organic

reorder_Organic_vs_nonOrganic<- order_products %>% left_join(products,by="product_id") %>% group_by(organic) %>% summarize(mean_reordered = mean(reordered))

reorder_Organic_vs_nonOrganic %>% ggplot(aes(x=organic,fill=organic,y=mean_reordered))+geom_bar(stat="identity")+guides(fill=guide_legend(title=NULL))+theme_wsj(base_size = 7)+ggtitle('Reorder Percentage: Organic vs Non-Organic')

How are aisles organized within departments?

tmp <- products %>% group_by (department_id,aisle_id) %>% summarize(n=n())
tmp <- tmp %>% left_join(departments,by="department_id")
tmp <- tmp %>% left_join(aisles,by="aisle_id")

tmp2<-order_products %>% 
  group_by(product_id) %>% 
  summarize(count=n()) %>% 
  left_join(products,by="product_id") %>% 
  ungroup() %>% 
  group_by(department_id,aisle_id) %>% 
  summarize(numofsales = sum(count)) %>% 
  left_join(tmp, by = c("department_id", "aisle_id")) %>% 
  mutate(onesize = 1)
  


treemap(tmp2,index=c("department","aisle"),vSize="onesize",vColor="department",palette="Set3",
        title="",sortID="numofsales", border.col="#FFFFFF",type="categorical", fontsize.legend = 0,bg.labels ="#FFFFFF")

----------------
products %>% group_by (aisle_id,department_id) %>% summarize(n=n()) 
## Warning in Ops.factor(left): '-' not meaningful for factors

## Warning in Ops.factor(left): '-' not meaningful for factors
## Source: local data frame [134 x 3]
## Groups: aisle_id [?]
## 
##    aisle_id department_id     n
##       <int>         <int> <int>
## 1         1            20   146
## 2         2            16   271
## 3         3            19   832
## 4         4             9   543
## 5         5            13   409
## 6         6             2   548
## 7         7            12   100
## 8         8             3   297
## 9         9             9   399
## 10       10            17   218
## # ... with 124 more rows

How often are products from the department/aisle sold?

The size of the boxes shows the number of sales.
treemap(tmp2,index=c("department","aisle"),vSize="numofsales",title="",palette="Set1",border.col="#FFFFFF")

How are aisles organized within departments?

treemap(tmp2,index=c("department","aisle"),vSize="onesize",vColor="department",palette="Set3",title="",sortID="-numofsales", border.col="#FFFFFF",type="categorical", fontsize.legend = 0,bg.labels = "#FFFFFF")

Which hour in each day has the most orders

orders <- fread('C:/Users/dannyhuang/Desktop/KAGGLE/orders.csv')
p0 <- ggplot(orders[order_dow == 0, ], aes(x = order_hour_of_day)) +
  geom_bar(fill = c(rep("grey25", 14), "gold", rep("grey25", 9))) +
  theme_minimal() +
  theme(axis.ticks.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        legend.position = "none",
        panel.grid.major = element_blank()) +
  labs(y = "Day 0")

p1 <- ggplot(orders[order_dow == 1, ], aes(x = order_hour_of_day)) +
  geom_bar(fill = c(rep("grey25", 10), "gold", rep("grey25", 13))) +
  theme_minimal() +
  theme(axis.ticks.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        legend.position = "none",
        panel.grid.major = element_blank()) +
  labs(y = "Day 1")

p2 <- ggplot(orders[order_dow == 2, ], aes(x = order_hour_of_day)) +
  geom_bar(fill = c(rep("grey25", 10), "gold", rep("grey25", 13))) +
  theme_minimal() +
  theme(axis.ticks.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        legend.position = "none",
        panel.grid.major = element_blank()) +
  labs(y = "Day 2")

p3 <- ggplot(orders[order_dow == 3, ], aes(x = order_hour_of_day)) +
  geom_bar(fill = c(rep("grey25", 10), "gold", rep("grey25", 13))) +
  theme_minimal() +
  theme(axis.ticks.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        legend.position = "none",
        panel.grid.major = element_blank()) +
  labs(y = "Day 3")

p4 <- ggplot(orders[order_dow == 4, ], aes(x = order_hour_of_day)) +
  geom_bar(fill = c(rep("grey25", 10), "gold", rep("grey25", 13))) +
  theme_minimal() +
  theme(axis.ticks.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        legend.position = "none",
        panel.grid.major = element_blank()) +
  labs(y = "Day 4")

p5 <- ggplot(orders[order_dow == 5, ], aes(x = order_hour_of_day)) +
  geom_bar(fill = c(rep("grey25", 10), "gold", rep("grey25", 13))) +
  theme_minimal() +
  theme(axis.ticks.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        legend.position = "none",
        panel.grid.major = element_blank()) +
  labs(y = "Day 5")

p6 <- ggplot(orders[order_dow == 6, ], aes(x = order_hour_of_day)) +
  geom_bar(fill = c(rep("grey25", 14), "gold", rep("grey25", 9))) +
  theme_minimal() +
  theme(axis.ticks.x = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        legend.position = "none",
        panel.grid.major = element_blank()) +
  labs(y = "Day 6",
       x = "Hour of the Day")



grid.arrange(p0, p1, p2, p3, p4, p5, p6, ncol = 1)

Association Rules

#library(readr)
#library(plyr)
#ordr_pr <- fread("C:/Users/dannyhuang/Desktop/KAGGLE/order_products__prior.csv")
#prods <- read_csv("C:/Users/dannyhuang/Desktop/KAGGLE//products.csv")


#order_baskets <- ordr_pr %>% 
#  inner_join(prods, by="product_id") %>%
#  select(order_id,product_name)


#df_sorted <- order_baskets[order(order_baskets$order_id),]
#df_sorted$order_id <- as.numeric(df_sorted$order_id)
#df_sorted$product_name <- as.factor(df_sorted$product_name)


###df_itemList <- ddply(df_sorted,"order_id", 
#                       function(df1)paste(df1$product_name, 
 #                      collapse = ","))


#df_itemList$order_id <- NULL

#write.csv(df_itemList,"ItemList.csv",quote = FALSE, row.names = TRUE)




txn = read.transactions(file="ItemList.csv", rm.duplicates= TRUE, format="basket",sep=",",cols=1,quote = "")
## distribution of transactions with duplicates:
## items
##     1     2     3     4     5     6     7 
## 10868  3303   504    65    43    25     2
itemFrequencyPlot(txn, topN = 30)

rules1 <- apriori(txn, parameter = list(supp = 0.005, conf = 0.1, maxlen=3)) 
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport support minlen maxlen
##         0.1    0.1    1 none FALSE            TRUE   0.005      1      3
##  target   ext
##   rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 16074 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[50883 item(s), 3214875 transaction(s)] done [9.19s].
## sorting and recoding items ... [264 item(s)] done [0.30s].
## creating transaction tree ... done [3.83s].
## checking subsets of size 1 2 3 done [0.36s].
## writing ... [109 rule(s)] done [0.00s].
## creating S4 object  ... done [0.66s].
rules2 <- apriori(txn, parameter = list(supp = 0.005, conf = 0.6, maxlen=3), control = list(verbose = FALSE)) 


inspect(sort(rules1, by="lift")[1:30])
##    lhs                                          rhs                                           support confidence      lift
## 1  {Milk,                                                                                                                 
##     Organic}                                 => {Vitamin D}                               0.006382519  0.8848974 96.031407
## 2  {YoKids Squeezers Organic Low-Fat Yogurt} => {Strawberry}                              0.006100082  1.0000000 85.787191
## 3  {Strawberry}                              => {YoKids Squeezers Organic Low-Fat Yogurt} 0.006100082  0.5233089 85.787191
## 4  {Organic,                                                                                                              
##     Vitamin D}                               => {Milk}                                    0.006382519  1.0000000 75.080572
## 5  {Vitamin D}                               => {Milk}                                    0.008989463  0.9755604 73.245629
## 6  {Milk}                                    => {Vitamin D}                               0.008989463  0.6749340 73.245629
## 7  {Organic Red Radish}                      => {Bunch}                                   0.008468759  1.0000000 70.656593
## 8  {Bunch}                                   => {Organic Red Radish}                      0.008468759  0.5983736 70.656593
## 9  {Bag}                                     => {Clementines}                             0.012405459  0.8663785 39.417204
## 10 {Clementines}                             => {Bag}                                     0.012405459  0.5644052 39.417204
## 11 {Milk,                                                                                                                 
##     Vitamin D}                               => {Organic}                                 0.006382519  0.7100000 31.393968
## 12 {Vitamin D}                               => {Organic}                                 0.006382519  0.6926479 30.626711
## 13 {Organic}                                 => {Vitamin D}                               0.006382519  0.2822149 30.626711
## 14 {Milk}                                    => {Organic}                                 0.007212722  0.5415353 23.944989
## 15 {Organic}                                 => {Milk}                                    0.007212722  0.3189239 23.944989
## 16 {Limes}                                   => {Organic Cilantro}                        0.005463665  0.1249049  5.775755
## 17 {Organic Cilantro}                        => {Limes}                                   0.005463665  0.2526466  5.775755
## 18 {Organic Garlic}                          => {Organic Yellow Onion}                    0.006865897  0.2010694  5.698985
## 19 {Organic Yellow Onion}                    => {Organic Garlic}                          0.006865897  0.1946026  5.698985
## 20 {Limes}                                   => {Large Lemon}                             0.008523815  0.1948630  4.103711
## 21 {Large Lemon}                             => {Limes}                                   0.008523815  0.1795070  4.103711
## 22 {Organic Lemon}                           => {Organic Hass Avocado}                    0.006609277  0.2421149  3.644323
## 23 {Organic Cucumber}                        => {Organic Hass Avocado}                    0.005429760  0.2171360  3.268340
## 24 {Organic Raspberries}                     => {Organic Strawberries}                    0.010533224  0.2470724  3.000974
## 25 {Organic Strawberries}                    => {Organic Raspberries}                     0.010533224  0.1279379  3.000974
## 26 {Large Lemon}                             => {Organic Avocado}                         0.007595008  0.1599468  2.908175
## 27 {Organic Avocado}                         => {Large Lemon}                             0.007595008  0.1380935  2.908175
## 28 {Organic Blueberries}                     => {Organic Strawberries}                    0.007389401  0.2374175  2.883705
## 29 {Limes}                                   => {Organic Avocado}                         0.006846300  0.1565133  2.845747
## 30 {Organic Avocado}                         => {Limes}                                   0.006846300  0.1244804  2.845747
inspect(sort(rules2, by="lift"))
##   lhs                                          rhs               support confidence     lift
## 1 {Milk,                                                                                    
##    Organic}                                 => {Vitamin D}   0.006382519  0.8848974 96.03141
## 2 {YoKids Squeezers Organic Low-Fat Yogurt} => {Strawberry}  0.006100082  1.0000000 85.78719
## 3 {Organic,                                                                                 
##    Vitamin D}                               => {Milk}        0.006382519  1.0000000 75.08057
## 4 {Vitamin D}                               => {Milk}        0.008989463  0.9755604 73.24563
## 5 {Milk}                                    => {Vitamin D}   0.008989463  0.6749340 73.24563
## 6 {Organic Red Radish}                      => {Bunch}       0.008468759  1.0000000 70.65659
## 7 {Bag}                                     => {Clementines} 0.012405459  0.8663785 39.41720
## 8 {Milk,                                                                                    
##    Vitamin D}                               => {Organic}     0.006382519  0.7100000 31.39397
## 9 {Vitamin D}                               => {Organic}     0.006382519  0.6926479 30.62671
plot(rules1, method="graph", control=list(type="items"))

plot(rules2, method="graph", control=list(type="items"))

plot(rules1, method="paracoord", control=list(reorder=TRUE))

plot(rules2, method="paracoord", control=list(reorder=TRUE))

Frequent Items

item_frequencies <- itemFrequency(txn, type="a")
support <- 0.02
freq_items <- sort(item_frequencies, decreasing = F)
freq_items <- freq_items[freq_items>support*length(txn)]

par(mar=c(2,10,2,2)); options(scipen=5)
barplot(freq_items, horiz=T, las=1, main="Frequent Items", cex.names=.8, xlim=c(0,500000))
mtext(paste("support:",support), padj = .8)
abline(v=support*length(txn), col="red")