orders <- orders %>% mutate(order_hour_of_day = as.numeric(order_hour_of_day), eval_set = as.factor(eval_set))
products <- products %>% mutate(product_name = as.factor(product_name))
aisles <- aisles %>% mutate(aisle = as.factor(aisle))
departments <- departments %>% mutate(department = as.factor(department))
*When do people order? (HOUR)
orders %>%
ggplot(aes(x=order_hour_of_day)) +
geom_histogram(stat="count",fill="red")+theme_economist()
## Warning: Ignoring unknown parameters: binwidth, bins, pad
NotFancy <- function(l) {
l <- format(l, scientific = TRUE)
parse(text=l)
}
orders %>%
ggplot(aes(x=order_dow)) +
geom_line(stat="count",fill="red")+theme_economist()+xlab("DAY OF WEEK")+
scale_y_continuous(labels=NotFancy) +ylab("Total Order Counts")
## Warning: Ignoring unknown parameters: fill
orders %>%
ggplot(aes(x=days_since_prior_order)) +
geom_histogram(stat="count",fill="red")+theme_economist()
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 206209 rows containing non-finite values (stat_count).
orders %>% filter(eval_set=="prior") %>% count(order_number) %>% ggplot(aes(order_number,n)) + geom_line(color="red", size=1)+geom_point(size=2, color="red")+theme_economist()
order_products %>%
group_by(order_id) %>%
summarize(n_items = last(add_to_cart_order)) %>%
ggplot(aes(x=n_items))+
geom_histogram(stat="count",fill="red")+theme_economist()+
geom_rug()+
coord_cartesian(xlim=c(0,80))
## Warning: Ignoring unknown parameters: binwidth, bins, pad
top10_ordered_products <- order_products %>%
group_by(product_id) %>%
summarize(count = n()) %>%
top_n(10, wt = count) %>%
left_join(select(products,product_id,product_name),by="product_id") %>%
arrange(desc(count))
top10_ordered_products %>%
ggplot(aes(x=reorder(product_name,-count), y=count))+
geom_bar(stat="identity",fill="blue")+theme_wsj()+
theme(axis.text.x=element_text(angle=90, hjust=1),axis.title.x = element_blank())
top10_reordered_products <-order_products %>%
group_by(product_id) %>%
summarize(proportion_reordered = mean(reordered), n=n()) %>%
filter(n>40) %>%
top_n(10,wt=proportion_reordered) %>%
arrange(desc(proportion_reordered)) %>%
left_join(products,by="product_id")
top10_reordered_products %>%
ggplot(aes(x=reorder(product_name,-proportion_reordered), y=proportion_reordered))+
geom_bar(stat="identity",fill="red")+theme_wsj()+
theme(axis.text.x=element_text(angle=90, hjust=1),axis.title.x = element_blank())+coord_cartesian(ylim=c(0.85,0.95))
first_item_in_baskset <- order_products %>%
group_by(product_id, add_to_cart_order) %>%
summarize(count = n()) %>% mutate(pct=count/sum(count)) %>%
filter(add_to_cart_order == 1, count>10) %>%
arrange(desc(pct)) %>%
left_join(products,by="product_id") %>%
select(product_name, pct, count) %>%
ungroup() %>%
top_n(10, wt=pct)
## Adding missing grouping variables: `product_id`
first_item_in_baskset %>%
ggplot(aes(x=reorder(product_name,-pct), y=pct))+
geom_bar(stat="identity",fill="red")+theme_economist()+
theme(axis.text.x=element_text(angle=90, hjust=1),axis.title.x = element_blank())+coord_cartesian(ylim=c(0.4,0.7))
order_products %>%
left_join(orders,by="order_id") %>%
group_by(days_since_prior_order) %>%
summarize(mean_reorder = mean(reordered)) %>%
ggplot(aes(x=days_since_prior_order,y=mean_reorder))+theme_wsj()+
geom_bar(stat="identity",fill="red")
fancy_scientific <- function(l) {
# turn in to character string in scientific notation
l <- format(l, digits=9, decimal.mark=",", big.mark=",",small.mark=".", , small.interval=3,scientific = FALSE)
}
products <- products %>%
mutate(organic=ifelse(str_detect(str_to_lower(products$product_name),'organic'),"organic","not organic"), organic= as.factor(organic))
Organic_vs_nonOrganic <- order_products %>%
left_join(products, by="product_id") %>%
group_by(organic) %>%
summarize(count = n()) %>%
mutate(proportion = count/sum(count))
Organic_vs_nonOrganic %>%
ggplot(aes(x=organic,y=count, fill=organic))+theme_wsj(base_size = 7.5)+
geom_bar(stat="identity")+guides(fill=guide_legend(title=NULL))+guides(fill=guide_legend(title=NULL))+ggtitle('Number of products:Organic vs Non-organic') +scale_y_continuous(labels=fancy_scientific)
## Warning in prettyNum(.Internal(format(x, trim, digits, nsmall, width, 3L, :
## 'big.mark' and 'decimal.mark' are both ',', which could be confusing
reorder_Organic_vs_nonOrganic<- order_products %>% left_join(products,by="product_id") %>% group_by(organic) %>% summarize(mean_reordered = mean(reordered))
reorder_Organic_vs_nonOrganic %>% ggplot(aes(x=organic,fill=organic,y=mean_reordered))+geom_bar(stat="identity")+guides(fill=guide_legend(title=NULL))+theme_wsj(base_size = 7)+ggtitle('Reorder Percentage: Organic vs Non-Organic')
tmp <- products %>% group_by (department_id,aisle_id) %>% summarize(n=n())
tmp <- tmp %>% left_join(departments,by="department_id")
tmp <- tmp %>% left_join(aisles,by="aisle_id")
tmp2<-order_products %>%
group_by(product_id) %>%
summarize(count=n()) %>%
left_join(products,by="product_id") %>%
ungroup() %>%
group_by(department_id,aisle_id) %>%
summarize(numofsales = sum(count)) %>%
left_join(tmp, by = c("department_id", "aisle_id")) %>%
mutate(onesize = 1)
treemap(tmp2,index=c("department","aisle"),vSize="onesize",vColor="department",palette="Set3",
title="",sortID="numofsales", border.col="#FFFFFF",type="categorical", fontsize.legend = 0,bg.labels ="#FFFFFF")
----------------
products %>% group_by (aisle_id,department_id) %>% summarize(n=n())
## Warning in Ops.factor(left): '-' not meaningful for factors
## Warning in Ops.factor(left): '-' not meaningful for factors
## Source: local data frame [134 x 3]
## Groups: aisle_id [?]
##
## aisle_id department_id n
## <int> <int> <int>
## 1 1 20 146
## 2 2 16 271
## 3 3 19 832
## 4 4 9 543
## 5 5 13 409
## 6 6 2 548
## 7 7 12 100
## 8 8 3 297
## 9 9 9 399
## 10 10 17 218
## # ... with 124 more rows
treemap(tmp2,index=c("department","aisle"),vSize="numofsales",title="",palette="Set1",border.col="#FFFFFF")
treemap(tmp2,index=c("department","aisle"),vSize="onesize",vColor="department",palette="Set3",title="",sortID="-numofsales", border.col="#FFFFFF",type="categorical", fontsize.legend = 0,bg.labels = "#FFFFFF")
orders <- fread('C:/Users/dannyhuang/Desktop/KAGGLE/orders.csv')
p0 <- ggplot(orders[order_dow == 0, ], aes(x = order_hour_of_day)) +
geom_bar(fill = c(rep("grey25", 14), "gold", rep("grey25", 9))) +
theme_minimal() +
theme(axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_blank(),
axis.title.x = element_blank(),
legend.position = "none",
panel.grid.major = element_blank()) +
labs(y = "Day 0")
p1 <- ggplot(orders[order_dow == 1, ], aes(x = order_hour_of_day)) +
geom_bar(fill = c(rep("grey25", 10), "gold", rep("grey25", 13))) +
theme_minimal() +
theme(axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_blank(),
axis.title.x = element_blank(),
legend.position = "none",
panel.grid.major = element_blank()) +
labs(y = "Day 1")
p2 <- ggplot(orders[order_dow == 2, ], aes(x = order_hour_of_day)) +
geom_bar(fill = c(rep("grey25", 10), "gold", rep("grey25", 13))) +
theme_minimal() +
theme(axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_blank(),
axis.title.x = element_blank(),
legend.position = "none",
panel.grid.major = element_blank()) +
labs(y = "Day 2")
p3 <- ggplot(orders[order_dow == 3, ], aes(x = order_hour_of_day)) +
geom_bar(fill = c(rep("grey25", 10), "gold", rep("grey25", 13))) +
theme_minimal() +
theme(axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_blank(),
axis.title.x = element_blank(),
legend.position = "none",
panel.grid.major = element_blank()) +
labs(y = "Day 3")
p4 <- ggplot(orders[order_dow == 4, ], aes(x = order_hour_of_day)) +
geom_bar(fill = c(rep("grey25", 10), "gold", rep("grey25", 13))) +
theme_minimal() +
theme(axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_blank(),
axis.title.x = element_blank(),
legend.position = "none",
panel.grid.major = element_blank()) +
labs(y = "Day 4")
p5 <- ggplot(orders[order_dow == 5, ], aes(x = order_hour_of_day)) +
geom_bar(fill = c(rep("grey25", 10), "gold", rep("grey25", 13))) +
theme_minimal() +
theme(axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_blank(),
axis.title.x = element_blank(),
legend.position = "none",
panel.grid.major = element_blank()) +
labs(y = "Day 5")
p6 <- ggplot(orders[order_dow == 6, ], aes(x = order_hour_of_day)) +
geom_bar(fill = c(rep("grey25", 14), "gold", rep("grey25", 9))) +
theme_minimal() +
theme(axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
legend.position = "none",
panel.grid.major = element_blank()) +
labs(y = "Day 6",
x = "Hour of the Day")
grid.arrange(p0, p1, p2, p3, p4, p5, p6, ncol = 1)
#library(readr)
#library(plyr)
#ordr_pr <- fread("C:/Users/dannyhuang/Desktop/KAGGLE/order_products__prior.csv")
#prods <- read_csv("C:/Users/dannyhuang/Desktop/KAGGLE//products.csv")
#order_baskets <- ordr_pr %>%
# inner_join(prods, by="product_id") %>%
# select(order_id,product_name)
#df_sorted <- order_baskets[order(order_baskets$order_id),]
#df_sorted$order_id <- as.numeric(df_sorted$order_id)
#df_sorted$product_name <- as.factor(df_sorted$product_name)
###df_itemList <- ddply(df_sorted,"order_id",
# function(df1)paste(df1$product_name,
# collapse = ","))
#df_itemList$order_id <- NULL
#write.csv(df_itemList,"ItemList.csv",quote = FALSE, row.names = TRUE)
txn = read.transactions(file="ItemList.csv", rm.duplicates= TRUE, format="basket",sep=",",cols=1,quote = "")
## distribution of transactions with duplicates:
## items
## 1 2 3 4 5 6 7
## 10868 3303 504 65 43 25 2
itemFrequencyPlot(txn, topN = 30)
rules1 <- apriori(txn, parameter = list(supp = 0.005, conf = 0.1, maxlen=3))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport support minlen maxlen
## 0.1 0.1 1 none FALSE TRUE 0.005 1 3
## target ext
## rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 16074
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[50883 item(s), 3214875 transaction(s)] done [9.19s].
## sorting and recoding items ... [264 item(s)] done [0.30s].
## creating transaction tree ... done [3.83s].
## checking subsets of size 1 2 3 done [0.36s].
## writing ... [109 rule(s)] done [0.00s].
## creating S4 object ... done [0.66s].
rules2 <- apriori(txn, parameter = list(supp = 0.005, conf = 0.6, maxlen=3), control = list(verbose = FALSE))
inspect(sort(rules1, by="lift")[1:30])
## lhs rhs support confidence lift
## 1 {Milk,
## Organic} => {Vitamin D} 0.006382519 0.8848974 96.031407
## 2 {YoKids Squeezers Organic Low-Fat Yogurt} => {Strawberry} 0.006100082 1.0000000 85.787191
## 3 {Strawberry} => {YoKids Squeezers Organic Low-Fat Yogurt} 0.006100082 0.5233089 85.787191
## 4 {Organic,
## Vitamin D} => {Milk} 0.006382519 1.0000000 75.080572
## 5 {Vitamin D} => {Milk} 0.008989463 0.9755604 73.245629
## 6 {Milk} => {Vitamin D} 0.008989463 0.6749340 73.245629
## 7 {Organic Red Radish} => {Bunch} 0.008468759 1.0000000 70.656593
## 8 {Bunch} => {Organic Red Radish} 0.008468759 0.5983736 70.656593
## 9 {Bag} => {Clementines} 0.012405459 0.8663785 39.417204
## 10 {Clementines} => {Bag} 0.012405459 0.5644052 39.417204
## 11 {Milk,
## Vitamin D} => {Organic} 0.006382519 0.7100000 31.393968
## 12 {Vitamin D} => {Organic} 0.006382519 0.6926479 30.626711
## 13 {Organic} => {Vitamin D} 0.006382519 0.2822149 30.626711
## 14 {Milk} => {Organic} 0.007212722 0.5415353 23.944989
## 15 {Organic} => {Milk} 0.007212722 0.3189239 23.944989
## 16 {Limes} => {Organic Cilantro} 0.005463665 0.1249049 5.775755
## 17 {Organic Cilantro} => {Limes} 0.005463665 0.2526466 5.775755
## 18 {Organic Garlic} => {Organic Yellow Onion} 0.006865897 0.2010694 5.698985
## 19 {Organic Yellow Onion} => {Organic Garlic} 0.006865897 0.1946026 5.698985
## 20 {Limes} => {Large Lemon} 0.008523815 0.1948630 4.103711
## 21 {Large Lemon} => {Limes} 0.008523815 0.1795070 4.103711
## 22 {Organic Lemon} => {Organic Hass Avocado} 0.006609277 0.2421149 3.644323
## 23 {Organic Cucumber} => {Organic Hass Avocado} 0.005429760 0.2171360 3.268340
## 24 {Organic Raspberries} => {Organic Strawberries} 0.010533224 0.2470724 3.000974
## 25 {Organic Strawberries} => {Organic Raspberries} 0.010533224 0.1279379 3.000974
## 26 {Large Lemon} => {Organic Avocado} 0.007595008 0.1599468 2.908175
## 27 {Organic Avocado} => {Large Lemon} 0.007595008 0.1380935 2.908175
## 28 {Organic Blueberries} => {Organic Strawberries} 0.007389401 0.2374175 2.883705
## 29 {Limes} => {Organic Avocado} 0.006846300 0.1565133 2.845747
## 30 {Organic Avocado} => {Limes} 0.006846300 0.1244804 2.845747
inspect(sort(rules2, by="lift"))
## lhs rhs support confidence lift
## 1 {Milk,
## Organic} => {Vitamin D} 0.006382519 0.8848974 96.03141
## 2 {YoKids Squeezers Organic Low-Fat Yogurt} => {Strawberry} 0.006100082 1.0000000 85.78719
## 3 {Organic,
## Vitamin D} => {Milk} 0.006382519 1.0000000 75.08057
## 4 {Vitamin D} => {Milk} 0.008989463 0.9755604 73.24563
## 5 {Milk} => {Vitamin D} 0.008989463 0.6749340 73.24563
## 6 {Organic Red Radish} => {Bunch} 0.008468759 1.0000000 70.65659
## 7 {Bag} => {Clementines} 0.012405459 0.8663785 39.41720
## 8 {Milk,
## Vitamin D} => {Organic} 0.006382519 0.7100000 31.39397
## 9 {Vitamin D} => {Organic} 0.006382519 0.6926479 30.62671
plot(rules1, method="graph", control=list(type="items"))
plot(rules2, method="graph", control=list(type="items"))
plot(rules1, method="paracoord", control=list(reorder=TRUE))
plot(rules2, method="paracoord", control=list(reorder=TRUE))
item_frequencies <- itemFrequency(txn, type="a")
support <- 0.02
freq_items <- sort(item_frequencies, decreasing = F)
freq_items <- freq_items[freq_items>support*length(txn)]
par(mar=c(2,10,2,2)); options(scipen=5)
barplot(freq_items, horiz=T, las=1, main="Frequent Items", cex.names=.8, xlim=c(0,500000))
mtext(paste("support:",support), padj = .8)
abline(v=support*length(txn), col="red")