#Loading packages and data
# install.packages("treemap")
library(treemap)
library(data.table)
library(dplyr)
library(ggplot2)
library(knitr)
library(stringr)
library(DT)
orders <- fread('C:/Users/Ajay/Documents/orders.csv')
products <- fread('C:/Users/Ajay/Documents/products.csv')
order_products <- fread('C:/Users/Ajay/Documents/order_products__train.csv')
order_products_prior <- fread('C:/Users/Ajay/Documents/order_products__prior.csv')
aisles <- fread('C:/Users/Ajay/Documents/aisles.csv')
departments <- fread('C:/Users/Ajay/Documents/departments.csv')
“Order file” gives a list of all orders we have in the dataset. 1 row per order. For example, we can see that user 1 has 11 orders, 1 of which is in the train set, and 10 of which are prior orders. The orders.csv doesn’t tell us about which products were ordered. This is contained in the order_products.csv
glimpse(orders)
## Observations: 3,421,083
## Variables: 7
## $ order_id <int> 2539329, 2398795, 473747, 2254736, 4315...
## $ user_id <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, ...
## $ eval_set <chr> "prior", "prior", "prior", "prior", "pr...
## $ order_number <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1, 2...
## $ order_dow <int> 2, 3, 3, 4, 4, 2, 1, 1, 1, 4, 4, 2, 5, ...
## $ order_hour_of_day <int> 8, 7, 12, 7, 15, 7, 9, 14, 16, 8, 8, 11...
## $ days_since_prior_order <dbl> NA, 15, 21, 29, 28, 19, 20, 14, 0, 30, ...
“Products file” contains the names of the products with their corresponding product_id. Furthermore the aisle and deparment are included.
glimpse(products)
## Observations: 49,688
## Variables: 4
## $ product_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...
## $ product_name <chr> "Chocolate Sandwich Cookies", "All-Seasons Salt"...
## $ aisle_id <int> 61, 104, 94, 38, 5, 11, 98, 116, 120, 115, 31, 1...
## $ department_id <int> 19, 13, 7, 1, 13, 11, 7, 1, 16, 7, 7, 1, 11, 17,...
“Order_products” file gives us information about which products (product_id) were ordered. It also contains information of the order (add_to_cart_order) in which the products were put into the cart and information of whether this product is a re-order(1) or not(0)
glimpse(order_products)
## Observations: 1,384,617
## Variables: 4
## $ order_id <int> 1, 1, 1, 1, 1, 1, 1, 1, 36, 36, 36, 36, 36, ...
## $ product_id <int> 49302, 11109, 10246, 49683, 43633, 13176, 47...
## $ add_to_cart_order <int> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7,...
## $ reordered <int> 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,...
“order_products_prior file” is structurally the same as the other_products, but consist of only last order of customer
glimpse(order_products_prior)
## Observations: 32,434,489
## Variables: 4
## $ order_id <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,...
## $ product_id <int> 33120, 28985, 9327, 45918, 30035, 17794, 401...
## $ add_to_cart_order <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6,...
## $ reordered <int> 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,...
“aisles file” contains aisle ID and name of aisle
glimpse(aisles)
## Observations: 134
## Variables: 2
## $ aisle_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ aisle <chr> "prepared soups salads", "specialty cheeses", "energy...
“departments file” contains department ID and department name
glimpse(departments)
## Observations: 21
## Variables: 2
## $ department_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...
## $ department <chr> "frozen", "other", "bakery", "produce", "alcohol...
Answer: Between “9AM to 7 PM” (As we can see in Histogram-Fig-01)
Fig.01.
Answer: In first two days(Sunday+Monday) (As we can see in Histogram-Fig-02)
Fig.02.
Answer: Customers seem to order more often after exactly 1 week or after exactly 1 Month.(As we can see in Histogram-Fig-03)
Fig.03.
Answer:All the customers in records have ordered atleast 4 times, Y-axis=>Number of customers , X-axis= Number of Orders per customer
Fig.04.
Answer:Most often customers include 5 items in a single order (y-axis=> Number of customers, X-axis=> Number of items per order
Fig.05.
Answer: See Fig-06
Fig.06.
Answer: 59% of the times, customers order previously ordered items
Fig.07.
Answer: Fig:08 gives whether top 10 sellling products are included in re-orders or included in new orders
Fig.08.
Answer: Fig:09 gives which products have highest probability of being reordered
Fig.09.
Answer: Fig:10 gives which products people have put into their cart first, Count gives number of times they have been put into cart first
Fig.10.
Answer: This is interesting: We can see that if people order again on the same day, they order the same product more often. Whereas when 30 days have passed, they tend to try out new things in their order.
Fig- 11.
Answer: Products with a high number of orders are naturally more likely to be reordered. However, there seems to be a ceiling effect.
Fig-12.
Answer: In total there are 21 departments containing 134 aisles.Size gives " Number of products in corresponding aisle and department
Fig-13.
Answer: Size = corresponds to number of Sales, Color= gives fractionof times that products were reordered
Fig-14.
Answer: Yes, Please refer to Fig-15, where I have predicted 4 items for each of top-15 customers( who have most re-orders), “Numbers in Yellow” are customer IDs and Dark color implies more number of reorders
Fig-15.
Answer: Let’s Predict for most reordering customers of USER-IDs- 99753, 84627
orders%>% filter(eval_set=="train" & user_id==99753)%>%merge(order_products, by="order_id")%>% merge(products, by="product_id")%>%
select(user_id,product_name)
## user_id product_name
## 1 99753 Organic Whole Milk
## 2 99753 Organic Reduced Fat Milk
orders%>% filter(eval_set=="train" & user_id==84627)%>%merge(order_products, by="order_id")%>% merge(products, by="product_id")%>%
select(user_id,product_name)
## user_id product_name
## 1 84627 Unsweetened Original Almond Breeze Almond Milk
# Change Char to Factors in all data frames
orders= orders%>% mutate(eval_set=as.factor(eval_set))
products=products%>% mutate(product_name = as.factor(product_name))
aisles = aisles%>% mutate(aisle= as.factor(aisle))
departments = departments%>% mutate(department= as.factor(department))
# Histogram of Order_hour_of_day
# at which hours of the day people order most
orders %>% ggplot(aes(x= order_hour_of_day))+
geom_histogram(binwidth=1, color="red", fill= "blue")+
labs(x="Order Hour of the day",y= "Number of orders",
title="Histogram of order_hour_of_day [Orders dataset] " )+
theme_bw()+theme(axis.text = element_text(size = 12), axis.title = element_text(size = 12),
title = element_text(size = 12) )
# At which day of week, people order most
# Histogram of order_dow
orders %>% ggplot(aes(x= order_dow))+
geom_histogram(binwidth= 1, color="red",fill="purple" )+
labs(x= " Order day of the week", y= "Number of orders",
title=" Histogram of order_dow [orders dataset]")+
theme_bw()+
theme(axis.text = element_text(size = 12), axis.title = element_text(size = 12),
title = element_text(size = 12) )
# After how many days people order
# Histogram of days_since prior order
orders %>% ggplot(aes(x= days_since_prior_order))+
geom_histogram(binwidth = 1, color="red", fill= " yellow")+
labs(x=" Days since prior order ", y= "Number of orders",
title="Histogram of days since prior order [orders dataset]")+
theme_bw()+
theme(axis.text = element_text(size = 12), axis.title = element_text(size = 12),
title = element_text(size = 12) )
# ********************************************************************************
# How many times 1st order, 2nd order..etc occurred
orders%>% select(order_number)%>%
group_by(order_number)%>%
summarise(Count2 = n())%>%
ggplot(aes(x= order_number, y= Count2))+
geom_col(color="Red", fill= "yellow")+
labs(x= "Number of order", y = " Number of users with that order")+
theme_bw()+
theme(axis.text = element_text(size = 12), axis.title = element_text(size = 12),
title = element_text(size = 12) )
# Means All the users in records has ordered
# atleast 4 times
# How many items are in 1 order
# for training set
order_products%>% group_by(order_id)%>% summarise(count3= n())%>%
ggplot(aes(x= count3))+ geom_histogram(binwidth = 1, color="red", fill="yellow")+
labs(x= "Number of items per order", y="Count")+ theme_bw()+
theme(axis.text = element_text(size = 12), axis.title = element_text(size = 12),
title = element_text(size = 12) )
# Most number of people include 5 items in an order
# for prior Set
order_products_prior %>% group_by(order_id)%>% summarise(count3= n())%>%
ggplot(aes(x= count3))+ geom_histogram(binwidth = 1, color="red", fill="pink")+
labs(x= "Number of items per order", y="Count")+ theme_bw()
# Most number of people include 5-6 items in an order
# Analyse which products sell most often
# for train data
order_products%>% group_by(product_id)%>%summarise(count4=n())%>%
arrange(desc(count4))%>%top_n(10, wt= count4)%>%
merge(products, by="product_id") %>% select(count4, product_name)%>%
ggplot(aes(x=reorder(product_name, -count4), y= count4, fill =product_name))+geom_col()+
theme(axis.text.x = element_text(angle = 90))+
labs(x= "Most sold products",y="sells", title=" Top 10 selling products")
# for prior data
order_products_prior%>% group_by(product_id)%>%summarise(count4=n())%>%
arrange(desc(count4))%>% top_n(10, wt= count4)%>%
merge(products, by="product_id")%>% select(count4, product_name)%>%
ggplot(aes(x=reorder(product_name, -count4), y= count4,fill =product_name))+geom_col()+
theme(axis.text.x = element_text(angle = 90))+
labs(x= "Most sold products",y="sells", title=" Top 10 selling products")+
theme(axis.text = element_text(size = 12), axis.title = element_text(size = 12),
title = element_text(size = 12) )
# Fraction of orders which are "NEW ORDERS" and which are " REORDERED ORDERS"
# for train data
order_products%>% group_by(reordered)%>% summarise(count5=n())%>%
mutate(Fraction= count5/sum(count5)) %>% ggplot(aes(x=as.factor(reordered), y=Fraction, fill= as.factor(reordered)))+
geom_col()+theme_bw()+
labs(x= "Reordered", y = "Fraction",
title="Fraction of items that are new or reorders")
# for prior data
order_products_prior%>% group_by(reordered)%>% summarise(count5=n())%>%
mutate(Fraction= count5/sum(count5)) %>% ggplot(aes(x=as.factor(reordered), y=Fraction,fill= as.factor(reordered)))+
geom_col()+theme_bw()+
labs(x= "Reordered", y = "Fraction",
title="Fraction of items that are new or reorders")+
theme(axis.text = element_text(size = 12), axis.title = element_text(size = 12),
title = element_text(size = 12) )
# Top 10 most selling products are Reorders or new Orders?
#for prior data
order_products_prior%>% group_by(product_id)%>%summarise(count4=n())%>% arrange(desc(count4))%>%top_n(10, wt=count4)%>%
merge(products, by="product_id")%>%
merge(order_products_prior, by="product_id")%>%
select(product_id, product_name, count4,reordered)%>%
group_by(product_id, reordered)%>% summarise(count6=n())%>%
group_by(product_id)%>% mutate(prop=count6/sum(count6)) %>%
merge(products,by="product_id")%>%
ggplot(aes(x= product_name, y= prop, fill= as.factor(reordered)))+
geom_col()+theme(axis.text.x = element_text(angle = 90))+
theme(axis.text = element_text(size = 12), axis.title = element_text(size = 12),
title = element_text(size = 12) )
# top 10 most reordered products
#for Prior data
order_products_prior%>% group_by(product_id)%>%summarise(fraction_reordered= mean(reordered),count7=n())%>%
filter(count7>50)%>% arrange(desc(fraction_reordered))%>%top_n(10, wt=fraction_reordered)%>% merge(products[, c("product_id", "product_name")], by="product_id")%>%
ggplot(aes(x=reorder(product_name,-fraction_reordered), y=fraction_reordered, fill=product_name))+
geom_col()+theme(axis.text.x = element_text(angle = 90))+
theme(axis.text = element_text(size = 12), axis.title = element_text(size = 12),
title = element_text(size = 12) )
#for train data
order_products%>% group_by(product_id)%>%summarise(fraction_reordered= mean(reordered),count7=n())%>%
filter(count7>50)%>% arrange(desc(fraction_reordered))%>%top_n(10, wt=fraction_reordered)%>% merge(products[, c("product_id", "product_name")], by="product_id")%>%
ggplot(aes(x=reorder(product_name,-fraction_reordered), y=fraction_reordered, fill=product_name))+
geom_col()+theme(axis.text.x = element_text(angle = 90))
# which products are put first in Market Basket
#for train data
tmp_var=order_products%>% group_by(product_id, add_to_cart_order)%>%summarise(count8=n())%>%
mutate(fraction_nth_order= count8/sum(count8))%>%
merge(products[, c("product_id", "product_name")], by="product_id")%>%
filter(add_to_cart_order==1, count8>10)%>% arrange(desc(fraction_nth_order))%>%
top_n(10, wt=fraction_nth_order)
tmp_var%>%
ggplot(aes(x=reorder(product_name, - fraction_nth_order), y= fraction_nth_order, fill =count8))+
geom_col()+theme(axis.text.x = element_text(angle = 90))+
theme(axis.text = element_text(size = 12), axis.title = element_text(size = 12),
title = element_text(size = 12) )
# Association between time of last order and probability of reorder"
tmp_var2= order_products%>% group_by(order_id)%>%
summarise(frac_reordered= mean(reordered))
tmp_var2%>%
merge(orders[, c("order_id","days_since_prior_order")], by="order_id")%>%
group_by(days_since_prior_order)%>% summarise(frac_reordered2=mean(frac_reordered))%>%ggplot(aes(x=days_since_prior_order, y=frac_reordered2, fill=as.factor(days_since_prior_order)))+
geom_col()+
theme(axis.text = element_text(size = 20), axis.title = element_text(size = 20),
title = element_text(size = 20) )
# Association between number of orders and probability of reordering"
tmp_var3=order_products %>% group_by(product_id)%>%
summarise(count9= n())
tmp_var4= order_products%>% group_by(product_id)%>%
summarise(frac_reordered4=mean(reordered))
tmp_var5= tmp_var3%>%merge(tmp_var4,by="product_id")
tmp_var5%>% filter(count9>20)%>%ggplot(aes(x=count9,y=frac_reordered4))+geom_point()+
coord_cartesian(xlim = c(0,2000))+geom_smooth(color="red")+
theme(axis.text = element_text(size = 20), axis.title = element_text(size = 20),
title = element_text(size = 20) )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Treemap of departments and aisles
# How products are arranged in Departments and aisles
tmp_v = products%>%select(department_id,aisle_id)%>%
group_by(department_id,aisle_id)%>% summarise(count13=n())
tmp_v=tmp_v%>%merge(aisles, by="aisle_id")%>%merge(departments, by= "department_id")%>%
select(department,aisle, count13)
tmp_v%>%treemap(index = c("department", "aisle"), vSize ="count13",vColor="department",
palette="Set3",border.col="#FFFFFF", bg.labels = "transparent",
fontsize.labels=c(18,16),
fontface.labels=c(2,1),
fontcolor.labels=c("black","black"),
align.labels=list(c("center","center"),c("center","center"))
)
# Visualizing the Product Portfolio
#How are products arrangemed aisles and departments of company
tmp_v2=order_products_prior%>% group_by(product_id)%>%summarise(count4=n())%>%
arrange(desc(count4))%>%
merge(products, by="product_id")%>%
merge(aisles, by="aisle_id")%>%merge(departments, by= "department_id")
tmp_v2%>%group_by(department,aisle)%>%summarise(Total_Sells=sum(count4))
## # A tibble: 134 x 3
## # Groups: department [?]
## department aisle Total_Sells
## <fct> <fct> <int>
## 1 alcohol beers coolers 48657
## 2 alcohol red wines 35181
## 3 alcohol specialty wines champagnes 11198
## 4 alcohol spirits 28102
## 5 alcohol white wines 30558
## 6 babies baby accessories 8160
## 7 babies baby bath body care 8581
## 8 babies baby food formula 382456
## 9 babies diapers wipes 24605
## 10 bakery bakery desserts 34871
## # ... with 124 more rows
# Size gives " Number of products in corresponding aisle and department
tmp_v2%>%treemap(index = c("department", "aisle"), vSize ="count4",vColor="department",
palette="Set3",border.col="#FFFFFF", bg.labels = "transparent",
fontsize.labels=c(20,16),
fontface.labels=c(2,1),
fontcolor.labels=c("black","black"),
align.labels=list(c("center","center"),c("center","center"))
)
#From Which departments and aisles people re-order the most
tmp_v3=order_products_prior%>% group_by(product_id)%>%
summarise(fraction_reordered= mean(reordered),count4=n())
tmp_v3=tmp_v3%>%merge(products, by="product_id")%>%
merge(aisles, by="aisle_id")%>%merge(departments, by= "department_id")
# Size = corresponds to number of orders
# Color= gives fractionof times that product was reordered
tmp_v3%>%treemap(index = c("department", "aisle"), vSize ="count4",vColor="fraction_reordered",
border.col="#FFFFFF", bg.labels = "transparent",type = "index",palette = "Reds",
fontsize.labels=c(20,17),
fontface.labels=c(2,2),
fontcolor.labels=c("yellow","cyan"),
align.labels=list(c("center","center"),c("center","center")))
# USer Profiling
tmp_var2= order_products%>% group_by(order_id)%>%
summarise(frac_reordered= mean(reordered))
tmp_var7= order_products_prior%>% group_by(order_id)%>%
summarise(frac_reordered= mean(reordered))
tmp_var2$order_id[tmp_var2$order_id %in% tmp_var7$order_id]
## integer(0)
merged1= bind_rows(tmp_var2,tmp_var7)
data1 = merged1%>% merge(orders, by="order_id")
data2 = data1%>%group_by(user_id)%>%
summarise(frac_reordered_each_userID=mean(frac_reordered),count11=n())%>%
arrange(desc(frac_reordered_each_userID))
# Top n customers who has most of items reordered
n=15
data3=data2%>% top_n(n, wt=frac_reordered_each_userID)
# what do these top n customers buy
merged3=bind_rows(order_products, order_products_prior)
merged4 = merged3%>%merge(orders,by="order_id")
# tmp_var7 gives products bought by each user_ID
tmp_var7= merged4%>%group_by(user_id, product_id)%>% summarise(count12=n())
# Products bought by top n Customers
tmp_var8 = tmp_var7[tmp_var7$user_id %in% data3$user_id, ]
# top 4 products bought by each of top n customers
tmp_var9= tmp_var8%>%group_by(user_id)%>%
top_n(4,wt=count12 )
Frequent_Reordered_products_by_n_customers=tmp_var9%>%
merge(products[, c("product_id","product_name")], by="product_id")
Frequent_Reordered_products_by_n_customers=
Frequent_Reordered_products_by_n_customers[,c("user_id","product_id","product_name","count12")]
Frequent_Reordered_products_by_n_customers_grouped=
Frequent_Reordered_products_by_n_customers%>% arrange(user_id)
colnames(Frequent_Reordered_products_by_n_customers_grouped)[4]="Count of item by user_id"
kable(Frequent_Reordered_products_by_n_customers_grouped)
user_id | product_id | product_name | Count of item by user_id |
---|---|---|---|
3269 | 27845 | Organic Whole Milk | 75 |
3269 | 35951 | Organic Unsweetened Almond Milk | 45 |
3269 | 36011 | Organic Fat Free Milk | 72 |
3269 | 49235 | Organic Half & Half | 69 |
17997 | 4210 | Whole Milk | 99 |
17997 | 19348 | Fat Free Milk | 74 |
17997 | 27086 | Half & Half | 89 |
17997 | 27730 | Almond Breeze Original Almond Milk | 83 |
26489 | 4210 | Whole Milk | 13 |
26489 | 13984 | Organic Mint | 63 |
26489 | 19677 | Original Pure Creamy Almond Milk | 90 |
26489 | 27086 | Half & Half | 13 |
37075 | 4149 | Frozen Organic Blueberries | 37 |
55331 | 6723 | Palmiers, Petite | 24 |
55331 | 17744 | Dark Chocolate with A Touch Of Sea Salt | 52 |
55331 | 21333 | Original Whipped Cream Cheese | 41 |
55331 | 40028 | Deluxe Cinnamon Raisin Bagels | 51 |
82414 | 4920 | Seedless Red Grapes | 93 |
82414 | 24852 | Banana | 71 |
82414 | 26096 | Organic Gala Apples, Bag | 93 |
82414 | 28199 | Clementines, Bag | 90 |
84627 | 22142 | 100% Pure Pumpkin | 3 |
84627 | 47144 | Unsweetened Original Almond Breeze Almond Milk | 54 |
91160 | 1 | Chocolate Sandwich Cookies | 26 |
91160 | 4210 | Whole Milk | 61 |
91160 | 19511 | Half And Half | 80 |
91160 | 30731 | Pure Cane Granulated White Sugar | 33 |
99753 | 27845 | Organic Whole Milk | 97 |
99753 | 38689 | Organic Reduced Fat Milk | 96 |
107528 | 2820 | Francese Deli Rolls | 52 |
107528 | 10114 | Classic French White Bread | 54 |
110225 | 4149 | Frozen Organic Blueberries | 28 |
110225 | 21137 | Organic Strawberries | 32 |
169980 | 19173 | Orange Calcium & Vitamin D Pulp Free | 13 |
169980 | 21903 | Organic Baby Spinach | 56 |
169980 | 24852 | Banana | 55 |
169980 | 38689 | Organic Reduced Fat Milk | 57 |
170174 | 32478 | Reduced Fat 2% Milk | 48 |
184517 | 13166 | Organic Milk Reduced Fat, 2% Milkfat | 68 |
184517 | 16757 | Barista Blend Pure Almond Milk | 70 |
184517 | 27845 | Organic Whole Milk | 2 |
184517 | 29447 | Milk, Organic, Vitamin D | 69 |
198506 | 9292 | Half And Half Ultra Pasteurized | 51 |
198506 | 19348 | Fat Free Milk | 42 |
198506 | 23909 | 2% Reduced Fat Milk | 3 |
Frequent_Reordered_products_by_n_customers_grouped11 = Frequent_Reordered_products_by_n_customers_grouped%>%mutate(extra=1)
# top 15 customers and recommended items for them
# Color give probability which item customer may buy again?
Frequent_Reordered_products_by_n_customers_grouped11%>%treemap(index = c("user_id","product_name"), vSize = "extra",vColor = "`Count of item by user_id`",
border.col="#FFFFFF", bg.labels = "transparent",type = "index",palette = "Reds",
fontsize.labels=c(18,17),
fontface.labels=c(2,2),
fontcolor.labels=c("yellow","cyan"),
align.labels=list(c("center","center"),c("center","center")))
TESTING OF Model
orders%>% filter(eval_set=="train" & user_id==99753)%>%merge(order_products, by="order_id")%>% merge(products, by="product_id")%>%
select(user_id,product_name)
## user_id product_name
## 1 99753 Organic Whole Milk
## 2 99753 Organic Reduced Fat Milk
orders%>% filter(eval_set=="train" & user_id==84627)%>%merge(order_products, by="order_id")%>% merge(products, by="product_id")%>%
select(user_id,product_name)
## user_id product_name
## 1 84627 Unsweetened Original Almond Breeze Almond Milk
orders%>% filter(eval_set=="train" & user_id==198506)%>%merge(order_products, by="order_id")%>% merge(products, by="product_id")%>%
select(user_id,product_name)
## user_id product_name
## 1 198506 Half And Half Ultra Pasteurized
## 2 198506 Fat Free Milk
CORRECTLY PREDICTED THE NEXT ORDER FOR ALL 3 CUSTOMERS ABOVE
100 % ACCURACY