Data Import
library(tidyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(readxl)
dt <- read_excel("~/VScode_project/R_projects/Data/orders_data.xlsx")
str(dt)
## tibble [171 x 12] (S3: tbl_df/tbl/data.frame)
## $ order_no : chr [1:171] "405-9763961-5211537" "404-3964908-7850720" "171-8103182-4289117" "405-3171677-9557154" ...
## $ order_date : chr [1:171] "Sun, 18 Jul, 2021, 10:38 pm IST" "Tue, 19 Oct, 2021, 6:05 pm IST" "Sun, 28 Nov, 2021, 10:20 pm IST" "Wed, 28 Jul, 2021, 4:06 am IST" ...
## $ buyer : chr [1:171] "Mr." "Minam" "yatipertin" "aciya" ...
## $ ship_city : chr [1:171] "CHANDIGARH," "PASIGHAT," "PASIGHAT," "DEVARAKONDA," ...
## $ ship_state : chr [1:171] "CHANDIGARH" "ARUNACHAL PRADESH" "ARUNACHAL PRADESH" "TELANGANA" ...
## $ sku : chr [1:171] "SKU: 2X-3C0F-KNJE" "SKU: DN-0WDX-VYOT" "SKU: DN-0WDX-VYOT" "SKU: AH-J3AO-R7DN" ...
## $ description : chr [1:171] "100% Leather Elephant Shaped Piggy Coin Bank | Block Printed West Bengal Handicrafts (Shantiniketan Art) | Mone"| __truncated__ "Women's Set of 5 Multicolor Pure Leather Single Lipstick Cases with Mirror, Handy and Compact Handcrafted Shant"| __truncated__ "Women's Set of 5 Multicolor Pure Leather Single Lipstick Cases with Mirror, Handy and Compact Handcrafted Shant"| __truncated__ "Pure 100% Leather Block Print Rectangular Jewelry Box with Mirror | Button Closure Multiple Utility Case (Shant"| __truncated__ ...
## $ quantity : chr [1:171] "1" "1" "1" "1" ...
## $ item_total : chr [1:171] "<U+20B9>449.00" "<U+20B9>449.00" "<U+20B9>449.00" NA ...
## $ shipping_fee: chr [1:171] NA "<U+20B9>60.18" "<U+20B9>60.18" NA ...
## $ cod : chr [1:171] NA NA NA "Cash On Delivery" ...
## $ order_status: chr [1:171] "Delivered to buyer" "Delivered to buyer" "Delivered to buyer" "Delivered to buyer" ...
Data Wrangling
library(stringr)
# Remove ₹
dt$item_total <- gsub("\\₹","",dt$item_total)
dt$shipping_fee <- gsub("\\₹","",dt$shipping_fee )
# Change to numeric
dt[,c(8:10)] <- lapply(dt[,c(8:10)],as.numeric)
# Change datetime format
a <- str_sub(dt$order_date, 6, -5)
library(lubridate)
dt$order_date <- parse_date_time(a, '%d%m%y, %I:%M %p')
# Change city,state into uppercase
dt[,c("ship_city","ship_state")] <- lapply(dt[,c("ship_city","ship_state")], tolower)
# Remove comma
dt[,c("ship_city","ship_state")] <- lapply(dt[,c("ship_city","ship_state")], function(x) gsub(",","",x))
Simple EDA
library(skimr)
skim(dt)
Data summary
| Name |
dt |
| Number of rows |
171 |
| Number of columns |
12 |
| _______________________ |
|
| Column type frequency: |
|
| character |
8 |
| numeric |
3 |
| POSIXct |
1 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| order_no |
0 |
1.00 |
19 |
19 |
0 |
171 |
0 |
| buyer |
0 |
1.00 |
1 |
15 |
0 |
149 |
0 |
| ship_city |
0 |
1.00 |
4 |
19 |
0 |
74 |
0 |
| ship_state |
0 |
1.00 |
3 |
17 |
0 |
26 |
0 |
| sku |
0 |
1.00 |
18 |
18 |
0 |
54 |
0 |
| description |
0 |
1.00 |
61 |
185 |
0 |
61 |
0 |
| cod |
124 |
0.27 |
16 |
16 |
0 |
1 |
0 |
| order_status |
0 |
1.00 |
18 |
18 |
0 |
2 |
0 |
Variable type: numeric
| quantity |
0 |
1 |
1.09 |
0.45 |
1 |
1 |
1 |
1 |
4 |
▇▁▁▁▁ |
| item_total |
171 |
0 |
NaN |
NA |
NA |
NA |
NA |
NA |
NA |
|
| shipping_fee |
171 |
0 |
NaN |
NA |
NA |
NA |
NA |
NA |
NA |
|
Variable type: POSIXct
| order_date |
0 |
1 |
2021-06-13 19:08:00 |
2022-02-25 20:44:00 |
2021-11-13 19:37:00 |
171 |
library(DataExplorer)
# create_report(dt)
Handling Missing Data
library(dplyr)
# Impute missing data with mode
## item_total, If price existing, use the mode of other order price
## shipping_fee, use mode of all shipping fee
## cod, if na, online payment
dt$descrip <- str_trim(substring(dt$description,0,40))
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
dt <- dt %>% group_by(descrip) %>%
mutate(item_total = ifelse(is.na(item_total), Mode(item_total), item_total))
dt <- dt %>% mutate(item_total = ifelse(is.na(item_total), Mode(dt$item_total), item_total))
dt <- dt %>% mutate(shipping_fee = ifelse(is.na(shipping_fee), Mode(dt$shipping_fee), shipping_fee))
dt <- dt %>% mutate(cod = if_else(is.na(cod), "Online", cod))
Visualization
- Sales Trend
- Which days of the week draw more sales?
- Which products sold the most overall?
- People from which states are ordering the most?
- Which products has been returned?
library(ggpubr)
dt$date <- as.Date(format(dt$order_date, "%Y-%m-%d"))
ggplot(data=dt, aes(x=date,y=item_total)) +
stat_summary(fun.y=sum, geom="point") +
stat_summary(fun.y=sum, geom="line") +
stat_summary(fun.y=sum, geom="area", fill="lightblue", alpha=0.5) +
scale_x_date(date_labels = "%b-%Y",date_breaks = "1 month") +
labs(x="Date",y="Total Sales (₹)") +
theme_classic2()

library(lubridate)
dt$day <- lubridate::wday(dt$date,label = TRUE, abbr = FALSE)
a <- ggplot(data=subset(dt,!(dt$day %in% c("Saturday","Sunday"))), aes(x=day,y=item_total)) +
geom_bar(stat="sum",na.rm=TRUE, fill="#E69F00",alpha=0.5) +
theme_classic2() +
theme(legend.position = "none") + labs(x="",y="Total Sales (₹)")
b <- ggplot(data=subset(dt,(dt$day %in% c("Saturday","Sunday"))), aes(x=day,y=item_total)) +
geom_bar(stat="sum",na.rm=TRUE, fill="#E69F00",alpha=0.5) +
scale_x_discrete(limits=c("Saturday","Sunday")) +
theme_classic2() +
theme(legend.position = "none")+ labs(x="",y="")
ggarrange(a,b,col=2,row=1, widths =c(2,1))

dt %>%
group_by(sku) %>%
summarise(count = n()) %>%
slice_max(count, n=10) %>%
ggplot(aes(x = reorder(sku,(-count)), y = count)) +
geom_bar(stat = 'identity',fill="grey") + geom_text(aes(label=count)) +
theme_classic2()+
theme(axis.text.x=element_text(angle=45,hjust=1)) + labs(x="",y="Quantity Sales")

df <- dt %>%
group_by(ship_state) %>%
count() %>%
ungroup() %>%
mutate(perc = `n` / sum(`n`)) %>%
arrange(perc) %>%
mutate(labels = scales::percent(perc)) %>%
slice_max(labels, n=7)
ggplot(df, aes(x = "", y = perc, fill = ship_state)) +
geom_col(color = "black") +
geom_label(aes(label = labels),
position = position_stack(vjust = 0.5),
show.legend = FALSE) +
guides(fill = guide_legend(title = "States")) +
scale_fill_viridis_d() +
coord_polar(theta = "y") +
theme_void()

return_items <- unique(dt[which(dt$order_status=="Returned to seller"),]$sku)
return_dt <- dt %>% filter(sku %in% return_items) %>% group_by(sku, order_status) %>%
count() %>%
ungroup(order_status) %>%
mutate(perc = `n` / sum(`n`)*100)
ggplot(data=return_dt,aes(x=sku,y=perc, fill=order_status)) + geom_bar(position="stack", stat="identity")+
coord_flip() + labs(y="") + theme_pubr()
