Data Import

library(tidyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(readxl)
dt <- read_excel("~/VScode_project/R_projects/Data/orders_data.xlsx")
str(dt)
## tibble [171 x 12] (S3: tbl_df/tbl/data.frame)
##  $ order_no    : chr [1:171] "405-9763961-5211537" "404-3964908-7850720" "171-8103182-4289117" "405-3171677-9557154" ...
##  $ order_date  : chr [1:171] "Sun, 18 Jul, 2021, 10:38 pm IST" "Tue, 19 Oct, 2021, 6:05 pm IST" "Sun, 28 Nov, 2021, 10:20 pm IST" "Wed, 28 Jul, 2021, 4:06 am IST" ...
##  $ buyer       : chr [1:171] "Mr." "Minam" "yatipertin" "aciya" ...
##  $ ship_city   : chr [1:171] "CHANDIGARH," "PASIGHAT," "PASIGHAT," "DEVARAKONDA," ...
##  $ ship_state  : chr [1:171] "CHANDIGARH" "ARUNACHAL PRADESH" "ARUNACHAL PRADESH" "TELANGANA" ...
##  $ sku         : chr [1:171] "SKU:  2X-3C0F-KNJE" "SKU:  DN-0WDX-VYOT" "SKU:  DN-0WDX-VYOT" "SKU:  AH-J3AO-R7DN" ...
##  $ description : chr [1:171] "100% Leather Elephant Shaped Piggy Coin Bank | Block Printed West Bengal Handicrafts (Shantiniketan Art) | Mone"| __truncated__ "Women's Set of 5 Multicolor Pure Leather Single Lipstick Cases with Mirror, Handy and Compact Handcrafted Shant"| __truncated__ "Women's Set of 5 Multicolor Pure Leather Single Lipstick Cases with Mirror, Handy and Compact Handcrafted Shant"| __truncated__ "Pure 100% Leather Block Print Rectangular Jewelry Box with Mirror | Button Closure Multiple Utility Case (Shant"| __truncated__ ...
##  $ quantity    : chr [1:171] "1" "1" "1" "1" ...
##  $ item_total  : chr [1:171] "<U+20B9>449.00" "<U+20B9>449.00" "<U+20B9>449.00" NA ...
##  $ shipping_fee: chr [1:171] NA "<U+20B9>60.18" "<U+20B9>60.18" NA ...
##  $ cod         : chr [1:171] NA NA NA "Cash On Delivery" ...
##  $ order_status: chr [1:171] "Delivered to buyer" "Delivered to buyer" "Delivered to buyer" "Delivered to buyer" ...

Data Wrangling

library(stringr)
# Remove ₹
dt$item_total <- gsub("\\₹","",dt$item_total)
dt$shipping_fee  <- gsub("\\₹","",dt$shipping_fee )

# Change to numeric
dt[,c(8:10)] <- lapply(dt[,c(8:10)],as.numeric)

# Change datetime format
a <- str_sub(dt$order_date, 6, -5)
library(lubridate)
dt$order_date <- parse_date_time(a, '%d%m%y, %I:%M %p')

# Change city,state into uppercase
dt[,c("ship_city","ship_state")] <- lapply(dt[,c("ship_city","ship_state")], tolower)
# Remove comma
dt[,c("ship_city","ship_state")] <- lapply(dt[,c("ship_city","ship_state")], function(x) gsub(",","",x))

Simple EDA

library(skimr)
skim(dt)
Data summary
Name dt
Number of rows 171
Number of columns 12
_______________________
Column type frequency:
character 8
numeric 3
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
order_no 0 1.00 19 19 0 171 0
buyer 0 1.00 1 15 0 149 0
ship_city 0 1.00 4 19 0 74 0
ship_state 0 1.00 3 17 0 26 0
sku 0 1.00 18 18 0 54 0
description 0 1.00 61 185 0 61 0
cod 124 0.27 16 16 0 1 0
order_status 0 1.00 18 18 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
quantity 0 1 1.09 0.45 1 1 1 1 4 ▇▁▁▁▁
item_total 171 0 NaN NA NA NA NA NA NA
shipping_fee 171 0 NaN NA NA NA NA NA NA

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
order_date 0 1 2021-06-13 19:08:00 2022-02-25 20:44:00 2021-11-13 19:37:00 171
library(DataExplorer)
# create_report(dt)

Handling Missing Data

library(dplyr)
# Impute missing data with mode
## item_total, If price existing, use the mode of other order price
## shipping_fee, use mode of all shipping fee
## cod, if na, online payment
dt$descrip <- str_trim(substring(dt$description,0,40))
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
dt <- dt %>% group_by(descrip) %>% 
        mutate(item_total = ifelse(is.na(item_total), Mode(item_total), item_total))
dt <- dt %>% mutate(item_total = ifelse(is.na(item_total), Mode(dt$item_total), item_total))
dt <- dt %>% mutate(shipping_fee = ifelse(is.na(shipping_fee), Mode(dt$shipping_fee), shipping_fee))
dt <- dt %>% mutate(cod = if_else(is.na(cod), "Online",  cod))

Visualization

library(ggpubr)
dt$date <- as.Date(format(dt$order_date, "%Y-%m-%d"))
ggplot(data=dt, aes(x=date,y=item_total)) +  
  stat_summary(fun.y=sum, geom="point") + 
  stat_summary(fun.y=sum, geom="line") +
  stat_summary(fun.y=sum, geom="area", fill="lightblue", alpha=0.5) +
  scale_x_date(date_labels = "%b-%Y",date_breaks = "1 month") +
  labs(x="Date",y="Total Sales (₹)") +
  theme_classic2()

library(lubridate)
dt$day <- lubridate::wday(dt$date,label = TRUE, abbr = FALSE)
a <- ggplot(data=subset(dt,!(dt$day %in% c("Saturday","Sunday"))), aes(x=day,y=item_total)) +   
  geom_bar(stat="sum",na.rm=TRUE, fill="#E69F00",alpha=0.5) +
  theme_classic2() +
  theme(legend.position = "none") + labs(x="",y="Total Sales (₹)")
b <- ggplot(data=subset(dt,(dt$day %in% c("Saturday","Sunday"))), aes(x=day,y=item_total)) +   
  geom_bar(stat="sum",na.rm=TRUE, fill="#E69F00",alpha=0.5) +
  scale_x_discrete(limits=c("Saturday","Sunday")) + 
  theme_classic2() +
  theme(legend.position = "none")+ labs(x="",y="")
ggarrange(a,b,col=2,row=1, widths =c(2,1))

dt %>% 
    group_by(sku) %>% 
    summarise(count = n()) %>% 
  slice_max(count, n=10) %>%
    ggplot(aes(x = reorder(sku,(-count)), y = count)) + 
    geom_bar(stat = 'identity',fill="grey")  + geom_text(aes(label=count)) +
  theme_classic2()+ 
  theme(axis.text.x=element_text(angle=45,hjust=1)) + labs(x="",y="Quantity Sales")

df <- dt %>% 
    group_by(ship_state) %>% 
  count() %>% 
  ungroup() %>% 
  mutate(perc = `n` / sum(`n`)) %>% 
  arrange(perc) %>%
  mutate(labels = scales::percent(perc)) %>% 
  slice_max(labels, n=7)

ggplot(df, aes(x = "", y = perc, fill = ship_state)) +
  geom_col(color = "black") +
  geom_label(aes(label = labels),
            position = position_stack(vjust = 0.5),
            show.legend = FALSE) +
  guides(fill = guide_legend(title = "States")) +
  scale_fill_viridis_d() +
  coord_polar(theta = "y") + 
  theme_void()

return_items <- unique(dt[which(dt$order_status=="Returned to seller"),]$sku)
return_dt <- dt %>% filter(sku %in% return_items) %>% group_by(sku, order_status) %>% 
                    count() %>% 
                    ungroup(order_status) %>% 
                    mutate(perc = `n` / sum(`n`)*100)
ggplot(data=return_dt,aes(x=sku,y=perc, fill=order_status)) + geom_bar(position="stack", stat="identity")+
   coord_flip() + labs(y="") + theme_pubr()