library(tidyverse) library(lubridate) library(skimr) library(corrplot) library(ggplot2) library(dplyr) file_path <- “C:/Users/Administrator/Desktop/UM sem1/WQD7004/数据集-小组/synthetic_ecommerce_sales_2025.csv” df_raw <- read.csv(file_path, stringsAsFactors = FALSE) glimpse(df_raw) skim(df_raw)
##1.1 Check Missing Values missing_summary <-
colSums(is.na(df_raw)) print(missing_summary) ##1.2 Remove Columns with
>50% Missing high_na <- names(missing_summary[missing_summary >
0.5 * nrow(df_raw)]) if(length(high_na) > 0) { df_clean <- df_raw
%>% select(-one_of(high_na)) } else { df_clean <- df_raw } ##1.3
Convert Data Types ###Convert columns to appropriate types:
###order_date → Date ###is_returned, product_category, region,
payment_method → factor
###quantity, delivery_days → integer ###product_price, discount_percent,
customer_rating → numeric df_clean <- df_clean %>% mutate(
order_date = as.Date(order_date, format = “%Y-%m-%d”), is_returned =
as.factor(is_returned), product_category = as.factor(product_category),
region = as.factor(region), payment_method = as.factor(payment_method),
quantity = as.integer(quantity), product_price =
as.numeric(product_price), discount_percent =
as.numeric(discount_percent), delivery_days = as.integer(delivery_days),
customer_rating = as.numeric(customer_rating) ) ##1.4 Handle Missing
Values (Median for Numeric, Mode for Factors) ### Numeric columns
numeric_cols <- df_clean %>% select(where(is.numeric)) %>%
names() for(col in numeric_cols) { if(any(is.na(df_clean[[col]]))) {
df_clean[[col]][is.na(df_clean[[col]])] <- median(df_clean[[col]],
na.rm = TRUE) } } ### Factor columns factor_cols <- df_clean %>%
select(where(is.factor)) %>% names() for(col in factor_cols) {
if(any(is.na(df_clean[[col]]))) { mode_val <-
names(sort(table(df_clean[[col]]), decreasing = TRUE))[1]
df_clean[[col]][is.na(df_clean[[col]])] <- mode_val } } ##1.5 Flag
Outliers ###Create binary flag columns for each variable based on
business rules (e.g., quantity >100 or ≤0, price >10000 or ≤0,
delivery days >30 or <0, discount >100 or <0, rating >5
or <1). Then create an overall flag is_any_outlier that is 1 if any
field is anomalous. df_clean <- df_clean %>% mutate(
is_quantity_outlier = if_else(quantity > 100 | quantity <= 0, 1,
0), is_price_outlier = if_else(product_price > 10000 | product_price
<= 0, 1, 0), is_delivery_outlier = if_else(delivery_days > 30 |
delivery_days < 0, 1, 0), is_discount_outlier =
if_else(discount_percent > 100 | discount_percent < 0, 1, 0),
is_rating_outlier = if_else(customer_rating > 5 | customer_rating
< 1, 1, 0), ###mark as 1 if any field is anomalous. is_any_outlier =
pmax(is_quantity_outlier, is_price_outlier, is_delivery_outlier,
is_discount_outlier, is_rating_outlier) ) ##1.6 Check Revenue
Consistency ### the dataset contains a revenue column, recompute revenue
as price * quantity * (1 - discount%) and keep only rows where the
calculated revenue matches the original within a tolerance of 0.01.
if(“revenue” %in% names(df_clean)) { df_clean <- df_clean %>%
mutate(revenue_calc = product_price * quantity * (1 -
discount_percent/100)) %>% filter(abs(revenue - revenue_calc) <
0.01) %>% select(-revenue_calc) } ##1.7 Remove Duplicate Rows
df_clean <- df_clean %>% distinct() ##1.8 Save Cleaned Dataset
output_dir <- dirname(file_path) output_path <-
file.path(output_dir, “ecommerce_cleaned.csv”) write.csv(df_clean,
output_path, row.names = FALSE) cat(“Cleaned data saved to:”,
output_path, “”)
print(paste(“Lower bound:”, lower)) print(paste(“Upper bound:”,
upper)) print(paste(“Range of quantity:”, range(df_clean$quantity)))
#2.Exploratory Data Analysis ##2.1 Dataset Summary After Cleaning
cat(“Cleaned dataset dimensions:”, dim(df_clean), “”) summary(df_clean)
##2.2 Univariate Analysis ### Return distribution ggplot(df_clean, aes(x
= is_returned, fill = is_returned)) + geom_bar() + labs(title =
“Distribution of Returns”, x = “Returned”, y = “Count”) +
theme_minimal() ###Returns account for about 6% of orders. The dataset
has class imbalance ##————————————————————————– ### Quantity
distribution ggplot(df_clean, aes(x = quantity)) + geom_histogram(bins =
30, fill = “steelblue”, alpha = 0.7) + labs(title = “Distribution of
Sales Quantity”, x = “Quantity”, y = “Frequency”) + theme_minimal()
###Order quantities are mostly between 2 and 4 items. Very few orders
have 1 or 6 items. The distribution is relatively concentrated
##————————————————————————– ### Price distribution (log scale)
ggplot(df_clean, aes(x = product_price)) + geom_histogram(bins = 40,
fill = “darkgreen”, alpha = 0.7) + scale_x_log10() + labs(title =
“Product Price Distribution (log scale)”, x = “Price (log)”, y =
“Frequency”) + theme_minimal() ###Product prices are right-skewed. Most
products are priced between $50 and $200, with a small number of
high‑price items. ##————————————————————————– ##2.3 Bivariate Analysis -
Classification (is_returned) ###Delivery days vs return ggplot(df_clean,
aes(x = is_returned, y = delivery_days, fill = is_returned)) +
geom_boxplot() + labs(title = “Delivery Days by Return Status”, x =
“Returned”, y = “Delivery Days”) + theme_minimal() ###Returned orders
have longer average delivery days (~6 days) compared to non‑returned
orders (~5 days). Longer delivery times are associated with higher
return risk. ##————————————————————————– ###Discount vs return
proportion df_clean %>% mutate(discount_group = cut(discount_percent,
breaks = c(0,5,15,30,100), labels =
c(“0-5%”,“5-15%”,“15-30%”,“30-100%”))) %>% ggplot(aes(x =
discount_group, fill = is_returned)) + geom_bar(position = “fill”) +
labs(title = “Return Proportion by Discount Level”, x = “Discount
Percent”, y = “Proportion”) + theme_minimal() ###Orders with high
discounts (30%+) show a slightly higher return proportion, but the
difference is not strong. Lower discount groups have lower return rates.
##————————————————————————– ###Customer rating density by return
ggplot(df_clean, aes(x = customer_rating, fill = is_returned)) +
geom_density(alpha = 0.5) + labs(title = “Customer Rating Density by
Return Status”, x = “Rating”, y = “Density”) + theme_minimal()
###Returned orders tend to have lower ratings (concentrated around
2.5–3.5) while non‑returned orders have higher ratings (3.5–4.5). Low
rating is positively correlated with returns.
##————————————————————————– ##2.4 Bivariate Analysis - Regression
(quantity) ### Price vs quantity ggplot(df_clean, aes(x = product_price,
y = quantity)) + geom_point(alpha = 0.2) + geom_smooth(method = “lm”, se
= FALSE, color = “red”) + labs(title = “Product Price vs Sales
Quantity”, x = “Price”, y = “Quantity”) + theme_minimal() ###There is a
weak negative correlation between price and quantity. Higher‑priced
items tend to be purchased in slightly smaller quantities.
##————————————————————————– ### Average quantity by region region_qty
<- df_clean %>% group_by(region) %>% summarise(avg_quantity =
mean(quantity), .groups = “drop”) ggplot(region_qty, aes(x =
reorder(region, avg_quantity), y = avg_quantity, fill = region)) +
geom_col() + coord_flip() + labs(title = “Average Sales Quantity by
Region”, x = “Region”, y = “Avg Quantity”) + theme_minimal() ###North
America and Europe have the highest average sales quantity per order,
while Africa and South America have relatively lower averages. This
suggests region‑specific stock and promotion strategies.
##————————————————————————– ### Discount vs quantity (colored by return)
ggplot(df_clean, aes(x = discount_percent, y = quantity, color =
is_returned)) + geom_point(alpha = 0.5) + geom_smooth(se = FALSE, method
= “lm”) + labs(title = “Discount vs Quantity (colored by Return)”, x =
“Discount (%)”, y = “Quantity”) + theme_minimal() ###No clear linear
relationship between discount and quantity. Returned orders are
scattered across all discount levels without a clear pattern.
##————————————————————————– ##2.5 Correlation Matrix numeric_vars <-
df_clean %>% select(where(is.numeric)) cor_matrix <-
cor(numeric_vars, use = “complete.obs”) corrplot(cor_matrix, method =
“number”, type = “upper”, tl.cex = 0.8, number.cex = 0.7) ##Revenue is
highly correlated with quantity and price (by definition). Discount
shows weak correlation with quantity; delivery days and return also show
weak correlation (around 0.1–0.2). ##————————————————————————– ## 2.6
Time Trend Analysis df_clean %>% group_by(month =
floor_date(order_date, “month”)) %>% summarise( total_quantity =
sum(quantity), return_rate = mean(as.numeric(is_returned) - 1, na.rm =
TRUE), .groups = “drop” ) %>% pivot_longer(cols = c(total_quantity,
return_rate), names_to = “metric”, values_to = “value”) %>%
ggplot(aes(x = month, y = value)) + geom_line(color = “steelblue”) +
facet_wrap(~metric, scales = “free_y”) + labs(title = “Monthly Trends:
Total Quantity and Return Rate”, x = “Month”, y = ““) + theme_minimal()
##Total sales quantity peaks in November and December (holiday season).
The return rate remains stable around 5–7% throughout the year, with no
significant increase during peak months.
##————————————————————————-
##2.7 Category-Level Insights category_return <- df_clean %>%
group_by(product_category) %>% summarise( return_rate =
mean(as.integer(is_returned) - 1, na.rm = TRUE), total_sales =
sum(quantity * product_price * (1 - discount_percent/100)) ) %>%
arrange(desc(return_rate))
print(category_return)
ggplot(category_return, aes(x = reorder(product_category, return_rate), y = return_rate, fill = product_category)) + geom_col() + coord_flip() + labs(title = “Return Rate by Product Category”, x = “Category”, y = “Return Rate”) + theme_minimal() ##Beauty and Fashion have the highest return rates (approximately 8–10%). Electronics and Automotive have the lowest return rates (approximately 3–5%). It is recommended to improve product descriptions or quality for high‑return categories.
#3. Key Findings cat(“========== EDA SUMMARY ==========”) cat(“1. Overall return rate:”, mean(as.numeric(df_clean\(is_returned) - 1, na.rm = TRUE) * 100, "%\n") cat("2. Average quantity per order:", mean(df_clean\)quantity), “”) cat(“3. Top 3 high-return categories:”) print(head(category_return %>% arrange(desc(return_rate)), 3)) cat(“. Correlation between delivery_days and return (point-biserial):”, cor(as.numeric(df_clean\(delivery_days), as.numeric(df_clean\)is_returned) - 1), “”) cat(“5. Best region for sales volume:”, region_qty\(region[which.max(region_qty\)avg_quantity)], “”)