library(tidyverse)
library(ggplot2)
library(readr)
library(tibble)
order_items <- read_csv("order_items.csv", show_col_types = FALSE)
orders <- read_csv("orders.csv", show_col_types = FALSE)
We chose this dataset because it provides comprehensive e-commerce transaction data, including order details, delivery times, customer demographics, and product attributes. These factors are crucial for understanding order volume trends and identifying delivery delay patterns. With over 100,000 orders, this dataset enables robust time series forecasting and predictive modeling to enhance operational efficiency.
- Optimizing Inventory Management: Predicting order volume helps retailers stock products efficiently, reducing overstock or stockouts.
- Improving Logistics & Delivery Planning: Identifying factors contributing to delivery delays allows for better route planning and carrier selection.
- Enhancing Customer Satisfaction: Predicting delays in advance enables proactive customer communication and improved service.
- Resource Allocation: Forecasting demand helps in workforce planning, ensuring enough staff for order fulfillment and customer support.
order_items <- as_tibble(order_items)
orders <- as_tibble(orders)
glimpse(order_items)
## Rows: 112,650
## Columns: 7
## $ order_id <chr> "00010242fe8c5a6d1ba2dd792cb16214", "00018f77f2f03…
## $ order_item_id <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1,…
## $ product_id <chr> "4244733e06e7ecb4970a6e2683c13e61", "e5f2d52b80218…
## $ seller_id <chr> "48436dade18ac8b2bce089ec2a041202", "dd7ddc04e1b6c…
## $ shipping_limit_date <dttm> 2017-09-19 09:45:35, 2017-05-03 11:05:13, 2018-01…
## $ price <dbl> 58.90, 239.90, 199.00, 12.99, 199.90, 21.90, 19.90…
## $ freight_value <dbl> 13.29, 19.93, 17.87, 12.79, 18.14, 12.69, 11.85, 7…
glimpse(orders)
## Rows: 99,441
## Columns: 8
## $ order_id <chr> "e481f51cbdc54678b7cc49136f2d6af7", "53c…
## $ customer_id <chr> "9ef432eb6251297304e76186b10a928d", "b08…
## $ order_status <chr> "delivered", "delivered", "delivered", "…
## $ order_purchase_timestamp <dttm> 2017-10-02 10:56:33, 2018-07-24 20:41:3…
## $ order_approved_at <dttm> 2017-10-02 11:07:15, 2018-07-26 03:24:2…
## $ order_delivered_carrier_date <dttm> 2017-10-04 19:55:00, 2018-07-26 14:31:0…
## $ order_delivered_customer_date <dttm> 2017-10-10 21:25:13, 2018-08-07 15:27:4…
## $ order_estimated_delivery_date <dttm> 2017-10-18, 2018-08-13, 2018-09-04, 201…
sum(is.na(order_items))
## [1] 0
sum(is.na(orders))
## [1] 4908
orders[orders$order_status == "delivered",]
## # A tibble: 96,478 × 8
## order_id customer_id order_status order_purchase_times…¹ order_approved_at
## <chr> <chr> <chr> <dttm> <dttm>
## 1 e481f51c… 9ef432eb62… delivered 2017-10-02 10:56:33 2017-10-02 11:07:15
## 2 53cdb2fc… b0830fb474… delivered 2018-07-24 20:41:37 2018-07-26 03:24:27
## 3 47770eb9… 41ce2a54c0… delivered 2018-08-08 08:38:49 2018-08-08 08:55:23
## 4 949d5b44… f88197465e… delivered 2017-11-18 19:28:06 2017-11-18 19:45:59
## 5 ad21c59c… 8ab97904e6… delivered 2018-02-13 21:18:39 2018-02-13 22:20:29
## 6 a4591c26… 503740e9ca… delivered 2017-07-09 21:57:05 2017-07-09 22:10:13
## 7 6514b8ad… 9bdf08b4b3… delivered 2017-05-16 13:10:30 2017-05-16 13:22:11
## 8 76c6e866… f54a9f0e6b… delivered 2017-01-23 18:29:09 2017-01-25 02:50:47
## 9 e69bfb5e… 31ad1d1b63… delivered 2017-07-29 11:55:02 2017-07-29 12:05:32
## 10 e6ce16cb… 494dded5b2… delivered 2017-05-16 19:41:10 2017-05-16 19:50:18
## # ℹ 96,468 more rows
## # ℹ abbreviated name: ¹​order_purchase_timestamp
## # ℹ 3 more variables: order_delivered_carrier_date <dttm>,
## # order_delivered_customer_date <dttm>, order_estimated_delivery_date <dttm>
order_items
## # A tibble: 112,650 × 7
## order_id order_item_id product_id seller_id shipping_limit_date price
## <chr> <dbl> <chr> <chr> <dttm> <dbl>
## 1 00010242fe8c5a6… 1 4244733e0… 48436dad… 2017-09-19 09:45:35 58.9
## 2 00018f77f2f0320… 1 e5f2d52b8… dd7ddc04… 2017-05-03 11:05:13 240.
## 3 000229ec398224e… 1 c777355d1… 5b51032e… 2018-01-18 14:48:30 199
## 4 00024acbcdf0a6d… 1 7634da152… 9d7a1d34… 2018-08-15 10:10:18 13.0
## 5 00042b26cf59d7c… 1 ac6c36230… df560393… 2017-02-13 13:57:51 200.
## 6 00048cc3ae777c6… 1 ef92defde… 6426d21a… 2017-05-23 03:55:27 21.9
## 7 00054e8431b9d76… 1 8d4f2bb7e… 7040e82f… 2017-12-14 12:10:31 19.9
## 8 000576fe3931984… 1 557d85097… 5996cdda… 2018-07-10 12:30:45 810
## 9 0005a1a1728c9d7… 1 310ae3c14… a416b6a8… 2018-03-26 18:31:29 146.
## 10 0005f50442cb953… 1 4535b0e10… ba143b05… 2018-07-06 14:10:56 54.0
## # ℹ 112,640 more rows
## # ℹ 1 more variable: freight_value <dbl>
orders
## # A tibble: 99,441 × 8
## order_id customer_id order_status order_purchase_times…¹ order_approved_at
## <chr> <chr> <chr> <dttm> <dttm>
## 1 e481f51c… 9ef432eb62… delivered 2017-10-02 10:56:33 2017-10-02 11:07:15
## 2 53cdb2fc… b0830fb474… delivered 2018-07-24 20:41:37 2018-07-26 03:24:27
## 3 47770eb9… 41ce2a54c0… delivered 2018-08-08 08:38:49 2018-08-08 08:55:23
## 4 949d5b44… f88197465e… delivered 2017-11-18 19:28:06 2017-11-18 19:45:59
## 5 ad21c59c… 8ab97904e6… delivered 2018-02-13 21:18:39 2018-02-13 22:20:29
## 6 a4591c26… 503740e9ca… delivered 2017-07-09 21:57:05 2017-07-09 22:10:13
## 7 136cce7f… ed0271e0b7… invoiced 2017-04-11 12:22:08 2017-04-13 13:25:17
## 8 6514b8ad… 9bdf08b4b3… delivered 2017-05-16 13:10:30 2017-05-16 13:22:11
## 9 76c6e866… f54a9f0e6b… delivered 2017-01-23 18:29:09 2017-01-25 02:50:47
## 10 e69bfb5e… 31ad1d1b63… delivered 2017-07-29 11:55:02 2017-07-29 12:05:32
## # ℹ 99,431 more rows
## # ℹ abbreviated name: ¹​order_purchase_timestamp
## # ℹ 3 more variables: order_delivered_carrier_date <dttm>,
## # order_delivered_customer_date <dttm>, order_estimated_delivery_date <dttm>
monthly_volume <- orders %>%
filter(order_status == "delivered", !is.na(order_delivered_customer_date)) %>%
mutate(month = floor_date(as.Date(order_delivered_customer_date), "month")) %>%
group_by(month) %>%
summarise(order_count = n())
ggplot(monthly_volume, aes(x = month, y = order_count)) +
geom_line(color = "steelblue") +
geom_point() +
labs(title = "Monthly Order Volume Trend", x = "Month", y = "Number of Delivered Orders") +
theme_minimal()
weekly_volume <- orders %>%
filter(order_status == "delivered", !is.na(order_delivered_customer_date)) %>%
mutate(week = floor_date(as.Date(order_delivered_customer_date), "week")) %>%
group_by(week) %>%
summarise(order_count = n())
ggplot(weekly_volume, aes(x = week, y = order_count)) +
geom_line(color = "darkgreen") +
labs(title = "Weekly Order Volume Trend", x = "Week", y = "Number of Delivered Orders") +
theme_minimal()
weekday_volume <- orders %>%
filter(order_status == "delivered", !is.na(order_delivered_customer_date)) %>%
mutate(weekday = wday(as.Date(order_delivered_customer_date), label = TRUE)) %>%
count(weekday)
ggplot(weekday_volume, aes(x = weekday, y = n)) +
geom_col(fill = "orange") +
labs(title = "Order Volume by Day of Week", x = "Weekday", y = "Orders")
orders_delay <- orders %>%
filter(order_status == "delivered") %>%
filter(!is.na(order_delivered_customer_date), !is.na(order_estimated_delivery_date))
orders_delay <- orders_delay %>%
mutate(
delivered_date = as.Date(order_delivered_customer_date),
estimated_date = as.Date(order_estimated_delivery_date),
delivery_delay = as.numeric(delivered_date - estimated_date)
)
ggplot(orders_delay, aes(x = delivery_delay)) +
geom_histogram(binwidth = 1, fill = "red", color = "white") +
coord_cartesian(xlim = c(-20, 40)) + # focus on -20 to +40 days
labs(
title = "Delivery Delay Distribution",
x = "Days Late (or Early)",
y = "Number of Orders"
) +
theme_minimal()
orders_delay <- orders_delay %>%
mutate(is_late = ifelse(delivery_delay > 0, "Late", "On-Time"))
delay_summary <- orders_delay %>%
count(is_late) %>%
mutate(percent = round(n / sum(n) * 100, 1))
ggplot(delay_summary, aes(x = "", y = percent, fill = is_late)) +
geom_col(width = 1) +
coord_polar("y") +
labs(title = "Delivery Performance: On-Time vs Late")
orders_delay <- orders_delay %>%
mutate(month_day = format(delivered_date, "%m-%d"),
is_holiday = case_when(
month_day %in% c("12-20", "12-21", "12-22", "12-23", "12-24", "12-25", "12-26") ~ "Christmas",
month_day %in% c("02-13", "02-14", "02-15") ~ "Valentine",
month_day %in% c("11-23", "11-24", "11-25") ~ "Thanksgiving",
month_day %in% c("12-31", "01-01") ~ "New Year",
TRUE ~ "Other"
))
holiday_delay <- orders_delay %>%
group_by(is_holiday) %>%
summarise(avg_delay = mean(delivery_delay, na.rm = TRUE))
ggplot(holiday_delay, aes(x = is_holiday, y = avg_delay)) +
geom_col(fill = "darkred") +
labs(title = "Average Delivery Delay During Holiday Windows", x = "Holiday", y = "Avg Delay (Days)")
order_size <- order_items %>%
count(order_id, name = "order_item_count")
delay_order_size <- orders_delay %>%
inner_join(order_size, by = "order_id")
ggplot(delay_order_size, aes(x = order_item_count, y = delivery_delay)) +
geom_boxplot() +
labs(title = "Delivery Delay by Order Size", x = "Items per Order", y = "Delay (Days)")