# Coerce the columns used by the plots below: the order hour becomes numeric,
# and the label-like columns become factors.
orders <- mutate(
  orders,
  order_hour_of_day = as.numeric(order_hour_of_day),
  eval_set = as.factor(eval_set)
)
products <- mutate(products, product_name = as.factor(product_name))
aisles <- mutate(aisles, aisle = as.factor(aisle))
departments <- mutate(departments, department = as.factor(department))
# When do people order? (hour of day)
# Number of orders placed in each hour of the day.
# geom_bar() counts rows per x value by default. The previous
# geom_histogram(stat = "count") drew the same bars but warned that its
# binning parameters (binwidth, bins, pad) were being ignored.
orders %>%
  ggplot(aes(x = order_hour_of_day)) +
  geom_bar(fill = "red") +
  theme_economist()
# Axis-label helper: renders the values `l` in scientific notation and returns
# them as an expression vector (one element per value), as produced by
# parse(text = ...).
NotFancy <- function(l) {
  sci_text <- format(l, scientific = TRUE)
  parse(text = sci_text)
}
# Total number of orders per day of the week, drawn as a red line.
# Lines are coloured through `colour`; the previous `fill = "red"` is not a
# geom_line() parameter, so it was dropped with an "unknown parameters" warning
# and the line was drawn in the default colour.
orders %>%
  ggplot(aes(x = order_dow)) +
  geom_line(stat = "count", colour = "red") +
  theme_economist() +
  xlab("DAY OF WEEK") +
  scale_y_continuous(labels = NotFancy) +
  ylab("Total Order Counts")
# Number of orders by days elapsed since the customer's previous order.
# Rows with a missing days_since_prior_order are dropped explicitly instead of
# relying on the plot to discard them with a "non-finite values" warning.
# NOTE(review): the NA rows are presumably each user's first order -- confirm.
# geom_bar() replaces geom_histogram(stat = "count"), which drew the same bars
# but warned about ignored binning parameters.
orders %>%
  filter(!is.na(days_since_prior_order)) %>%
  ggplot(aes(x = days_since_prior_order)) +
  geom_bar(fill = "red") +
  theme_economist()
# Number of "prior" orders at each order_number, drawn as a red line with a
# point at every order_number.
orders %>%
  filter(eval_set == "prior") %>%
  count(order_number) %>%
  ggplot(aes(x = order_number, y = n)) +
  geom_line(color = "red", size = 1) +
  geom_point(size = 2, color = "red") +
  theme_economist()
# Distribution of basket size (number of items per order).
# The basket size is taken as the largest add_to_cart_order within the order.
# The previous last(add_to_cart_order) gave the same number only when the rows
# of each order happened to be stored in cart order.
# geom_bar() replaces geom_histogram(stat = "count"), which drew the same bars
# but warned about ignored binning parameters.
order_products %>%
  group_by(order_id) %>%
  summarize(n_items = max(add_to_cart_order)) %>%
  ggplot(aes(x = n_items)) +
  geom_bar(fill = "red") +
  theme_economist() +
  geom_rug() +
  # Zoom the x axis to baskets of at most 80 items without dropping any data.
  coord_cartesian(xlim = c(0, 80))
# The ten products that appear most often in order_products, with their names
# attached, sorted from most to least ordered.
top10_ordered_products <- order_products %>%
  group_by(product_id) %>%
  summarise(count = n()) %>%
  top_n(n = 10, wt = count) %>%
  left_join(
    products %>% select(product_id, product_name),
    by = "product_id"
  ) %>%
  arrange(desc(count))
# Bar chart of the top-10 products, bars ordered by descending count and
# product names rotated to fit on the x axis.
ggplot(
  top10_ordered_products,
  aes(x = reorder(product_name, -count), y = count)
) +
  geom_bar(stat = "identity", fill = "blue") +
  theme_wsj() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title.x = element_blank()
  )
# The ten products with the highest share of reordered purchases, restricted to
# products bought more than 40 times, with the product details attached.
top10_reordered_products <- order_products %>%
  group_by(product_id) %>%
  summarise(
    proportion_reordered = mean(reordered),
    n = n()
  ) %>%
  filter(n > 40) %>%
  top_n(n = 10, wt = proportion_reordered) %>%
  arrange(desc(proportion_reordered)) %>%
  left_join(products, by = "product_id")
# Bar chart of the reorder proportion for those ten products, bars ordered by
# descending proportion; the y axis is zoomed to 0.85-0.95.
ggplot(
  top10_reordered_products,
  aes(
    x = reorder(product_name, -proportion_reordered),
    y = proportion_reordered
  )
) +
  geom_bar(stat = "identity", fill = "red") +
  theme_wsj() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title.x = element_blank()
  ) +
  coord_cartesian(ylim = c(0.85, 0.95))
# Products most often added to the cart first.
# For every product, pct is the share of its purchases in which it was the
# first item added (add_to_cart_order == 1); only products with more than 10
# such purchases are kept, and the ten highest pct values are returned.
first_item_in_baskset <- order_products %>%
  group_by(product_id, add_to_cart_order) %>%
  summarize(count = n()) %>%
  # summarize() dropped the add_to_cart_order grouping, so sum(count) here is
  # the per-product total.
  mutate(pct = count / sum(count)) %>%
  filter(add_to_cart_order == 1, count > 10) %>%
  # Ungroup before select(): selecting on the grouped table silently re-added
  # product_id ("Adding missing grouping variables"). It is now selected
  # explicitly, so the resulting columns are unchanged.
  ungroup() %>%
  arrange(desc(pct)) %>%
  left_join(products, by = "product_id") %>%
  select(product_id, product_name, pct, count) %>%
  top_n(10, wt = pct)
# Bar chart of pct for the selected products, bars ordered by descending pct;
# the y axis is zoomed to 0.4-0.7.
ggplot(
  first_item_in_baskset,
  aes(x = reorder(product_name, -pct), y = pct)
) +
  geom_bar(stat = "identity", fill = "red") +
  theme_economist() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    axis.title.x = element_blank()
  ) +
  coord_cartesian(ylim = c(0.4, 0.7))
# Mean reorder flag of purchased items, grouped by the number of days since the
# customer's previous order, drawn as one bar per day count.
order_products %>%
  left_join(orders, by = "order_id") %>%
  group_by(days_since_prior_order) %>%
  summarise(mean_reorder = mean(reordered)) %>%
  ggplot(aes(x = days_since_prior_order, y = mean_reorder)) +
  theme_wsj() +
  geom_bar(stat = "identity", fill = "red")
# Axis-label helper: formats numbers in fixed (non-scientific) notation with a
# comma as the thousands separator, e.g. 1000000 -> "1,000,000".
#
# l: numeric vector of axis break values.
# Returns a character vector of formatted labels.
#
# The decimal mark is "." so it can no longer be confused with the thousands
# separator; previously both were "," and format() warned about it.
fancy_scientific <- function(l) {
  format(
    l,
    digits = 9,
    big.mark = ",",
    decimal.mark = ".",
    scientific = FALSE
  )
}
# Flag each product as "organic" or "not organic" according to whether its
# name contains "organic" (case-insensitive), stored as a factor.
# The column is referenced directly inside mutate() rather than through
# products$..., so the expression always uses the rows being mutated.
products <- products %>%
  mutate(
    organic = ifelse(
      str_detect(str_to_lower(product_name), "organic"),
      "organic",
      "not organic"
    ),
    organic = as.factor(organic)
  )
# Number of purchased items per organic flag, plus each flag's share of all
# purchased items.
Organic_vs_nonOrganic <- order_products %>%
  left_join(products, by = "product_id") %>%
  group_by(organic) %>%
  summarise(count = n()) %>%
  mutate(proportion = count / sum(count))
# Bar chart of the purchase counts per organic flag, with an untitled legend
# and comma-formatted y-axis labels.
# The legend-title override is applied once; the original repeated the same
# guides() call twice.
Organic_vs_nonOrganic %>%
  ggplot(aes(x = organic, y = count, fill = organic)) +
  theme_wsj(base_size = 7.5) +
  geom_bar(stat = "identity") +
  guides(fill = guide_legend(title = NULL)) +
  ggtitle('Number of products:Organic vs Non-organic') +
  scale_y_continuous(labels = fancy_scientific)
## Warning in prettyNum(.Internal(format(x, trim, digits, nsmall, width, 3L, :
## 'big.mark' and 'decimal.mark' are both ',', which could be confusing
# Mean reorder flag of purchased items per organic flag.
reorder_Organic_vs_nonOrganic <- order_products %>%
  left_join(products, by = "product_id") %>%
  group_by(organic) %>%
  summarise(mean_reordered = mean(reordered))

# Bar chart of that mean, one bar per organic flag, with an untitled legend.
ggplot(
  reorder_Organic_vs_nonOrganic,
  aes(x = organic, fill = organic, y = mean_reordered)
) +
  geom_bar(stat = "identity") +
  guides(fill = guide_legend(title = NULL)) +
  theme_wsj(base_size = 7) +
  ggtitle('Reorder Percentage: Organic vs Non-Organic')
# tmp: number of products in every (department, aisle) pair, with the
# department and aisle names joined on.
tmp <- products %>%
  group_by(department_id, aisle_id) %>%
  summarise(n = n()) %>%
  left_join(departments, by = "department_id") %>%
  left_join(aisles, by = "aisle_id")

# tmp2: number of purchased items in every (department, aisle) pair, combined
# with tmp. `onesize` is a constant 1 so the treemap can draw equal-sized tiles.
tmp2 <- order_products %>%
  group_by(product_id) %>%
  summarise(count = n()) %>%
  left_join(products, by = "product_id") %>%
  ungroup() %>%
  group_by(department_id, aisle_id) %>%
  summarise(numofsales = sum(count)) %>%
  left_join(tmp, by = c("department_id", "aisle_id")) %>%
  mutate(onesize = 1)
# Treemap of aisles nested inside departments: every aisle gets the same tile
# size (vSize = "onesize"), tiles are coloured by department and sorted by
# numofsales.
treemap(
  tmp2,
  index = c("department", "aisle"),
  vSize = "onesize",
  vColor = "department",
  palette = "Set3",
  title = "",
  sortID = "numofsales",
  border.col = "#FFFFFF",
  type = "categorical",
  fontsize.legend = 0,
  bg.labels = "#FFFFFF"
)
# ----------------
# Number of products in every (aisle, department) pair.
# The separator above is kept as a comment: as bare code the run of dashes was
# parsed as repeated unary minus applied to `products`, which triggered the
# "'-' not meaningful for factors" warnings on its factor columns.
products %>%
  group_by(aisle_id, department_id) %>%
  summarize(n = n())
## Source: local data frame [134 x 3]
## Groups: aisle_id [?]
##
## aisle_id department_id n
## <int> <int> <int>
## 1 1 20 146
## 2 2 16 271
## 3 3 19 832
## 4 4 9 543
## 5 5 13 409
## 6 6 2 548
## 7 7 12 100
## 8 8 3 297
## 9 9 9 399
## 10 10 17 218
## # ... with 124 more rows
# Treemap of aisles nested inside departments, tile size given by numofsales.
treemap(
  tmp2,
  index = c("department", "aisle"),
  vSize = "numofsales",
  title = "",
  palette = "Set1",
  border.col = "#FFFFFF"
)

# Equal-sized tiles coloured by department, sorted with sortID "-numofsales".
treemap(
  tmp2,
  index = c("department", "aisle"),
  vSize = "onesize",
  vColor = "department",
  palette = "Set3",
  title = "",
  sortID = "-numofsales",
  border.col = "#FFFFFF",
  type = "categorical",
  fontsize.legend = 0,
  bg.labels = "#FFFFFF"
)
# Re-read the orders table from disk, replacing the `orders` object used above.
# NOTE(review): fread is presumably data.table::fread; the `orders[cond, ]`
# subsetting further down relies on the object it returns -- confirm.
orders <- fread(
  "C:/Users/User/Desktop/KAGGLE/orders.csv"
)
##
## Read 67.5% of 3421083 rows
## Read 3421083 rows and 7 (of 7) columns from 0.101 GB file in 00:00:03
# Build the bar chart of order counts per hour for one day of the week.
#
# orders_tbl:     table of orders with order_dow and order_hour_of_day columns.
# dow:            value of order_dow to plot; also used in the "Day <dow>"
#                 y-axis label.
# peak_hour:      hour (0-23) whose bar is drawn in gold; all others are grey.
# show_hour_axis: when TRUE, keep the x-axis text and label it
#                 "Hour of the Day"; when FALSE, blank the x-axis text and
#                 title (used for every panel except the bottom one).
#
# Returns a ggplot object.
#
# The fill colours are assigned to the bars by position and there are exactly
# 24 of them, so this assumes the selected day has orders in every hour 0-23.
plot_hourly_orders <- function(orders_tbl, dow, peak_hour,
                               show_hour_axis = FALSE) {
  # which() drops NA comparisons, so rows with a missing order_dow are skipped.
  day_orders <- orders_tbl[which(orders_tbl$order_dow == dow), ]
  bar_fill <- ifelse(0:23 == peak_hour, "gold", "grey25")

  hourly_plot <- ggplot(day_orders, aes(x = order_hour_of_day)) +
    geom_bar(fill = bar_fill) +
    theme_minimal() +
    theme(
      axis.ticks.x = element_blank(),
      axis.ticks.y = element_blank(),
      axis.text.y = element_blank(),
      legend.position = "none",
      panel.grid.major = element_blank()
    ) +
    labs(y = paste("Day", dow))

  if (show_hour_axis) {
    hourly_plot <- hourly_plot + labs(x = "Hour of the Day")
  } else {
    hourly_plot <- hourly_plot +
      theme(
        axis.text.x = element_blank(),
        axis.title.x = element_blank()
      )
  }
  hourly_plot
}

# One panel per day of the week. Days 0 and 6 highlight hour 14; days 1-5
# highlight hour 10. Only the bottom panel (day 6) shows the hour axis.
p0 <- plot_hourly_orders(orders, dow = 0, peak_hour = 14)
p1 <- plot_hourly_orders(orders, dow = 1, peak_hour = 10)
p2 <- plot_hourly_orders(orders, dow = 2, peak_hour = 10)
p3 <- plot_hourly_orders(orders, dow = 3, peak_hour = 10)
p4 <- plot_hourly_orders(orders, dow = 4, peak_hour = 10)
p5 <- plot_hourly_orders(orders, dow = 5, peak_hour = 10)
p6 <- plot_hourly_orders(orders, dow = 6, peak_hour = 14, show_hour_axis = TRUE)

# Stack the seven panels vertically in a single column.
grid.arrange(p0, p1, p2, p3, p4, p5, p6, ncol = 1)