1.Data Cleaning

library(tidyverse) library(lubridate) library(skimr) library(corrplot) library(ggplot2) library(dplyr) file_path <- “C:/Users/Administrator/Desktop/UM sem1/WQD7004/数据集-小组/synthetic_ecommerce_sales_2025.csv” df_raw <- read.csv(file_path, stringsAsFactors = FALSE) glimpse(df_raw) skim(df_raw)

##1.1 Check Missing Values missing_summary <- colSums(is.na(df_raw)) print(missing_summary) ##1.2 Remove Columns with >50% Missing high_na <- names(missing_summary[missing_summary > 0.5 * nrow(df_raw)]) if(length(high_na) > 0) { df_clean <- df_raw %>% select(-one_of(high_na)) } else { df_clean <- df_raw } ##1.3 Convert Data Types ###Convert columns to appropriate types: ###order_date → Date ###is_returned, product_category, region, payment_method → factor
###quantity, delivery_days → integer ###product_price, discount_percent, customer_rating → numeric df_clean <- df_clean %>% mutate( order_date = as.Date(order_date, format = “%Y-%m-%d”), is_returned = as.factor(is_returned), product_category = as.factor(product_category), region = as.factor(region), payment_method = as.factor(payment_method), quantity = as.integer(quantity), product_price = as.numeric(product_price), discount_percent = as.numeric(discount_percent), delivery_days = as.integer(delivery_days), customer_rating = as.numeric(customer_rating) ) ##1.4 Handle Missing Values (Median for Numeric, Mode for Factors) ### Numeric columns numeric_cols <- df_clean %>% select(where(is.numeric)) %>% names() for(col in numeric_cols) { if(any(is.na(df_clean[[col]]))) { df_clean[[col]][is.na(df_clean[[col]])] <- median(df_clean[[col]], na.rm = TRUE) } } ### Factor columns factor_cols <- df_clean %>% select(where(is.factor)) %>% names() for(col in factor_cols) { if(any(is.na(df_clean[[col]]))) { mode_val <- names(sort(table(df_clean[[col]]), decreasing = TRUE))[1] df_clean[[col]][is.na(df_clean[[col]])] <- mode_val } } ##1.5 Flag Outliers ###Create binary flag columns for each variable based on business rules (e.g., quantity >100 or ≤0, price >10000 or ≤0, delivery days >30 or <0, discount >100 or <0, rating >5 or <1). Then create an overall flag is_any_outlier that is 1 if any field is anomalous. df_clean <- df_clean %>% mutate( is_quantity_outlier = if_else(quantity > 100 | quantity <= 0, 1, 0), is_price_outlier = if_else(product_price > 10000 | product_price <= 0, 1, 0), is_delivery_outlier = if_else(delivery_days > 30 | delivery_days < 0, 1, 0), is_discount_outlier = if_else(discount_percent > 100 | discount_percent < 0, 1, 0), is_rating_outlier = if_else(customer_rating > 5 | customer_rating < 1, 1, 0), ###mark as 1 if any field is anomalous. is_any_outlier = pmax(is_quantity_outlier, is_price_outlier, is_delivery_outlier, is_discount_outlier, is_rating_outlier) ) ##1.6 Check Revenue Consistency ### the dataset contains a revenue column, recompute revenue as price * quantity * (1 - discount%) and keep only rows where the calculated revenue matches the original within a tolerance of 0.01. if(“revenue” %in% names(df_clean)) { df_clean <- df_clean %>% mutate(revenue_calc = product_price * quantity * (1 - discount_percent/100)) %>% filter(abs(revenue - revenue_calc) < 0.01) %>% select(-revenue_calc) } ##1.7 Remove Duplicate Rows df_clean <- df_clean %>% distinct() ##1.8 Save Cleaned Dataset output_dir <- dirname(file_path) output_path <- file.path(output_dir, “ecommerce_cleaned.csv”) write.csv(df_clean, output_path, row.names = FALSE) cat(“Cleaned data saved to:”, output_path, “”)

print(paste(“Lower bound:”, lower)) print(paste(“Upper bound:”, upper)) print(paste(“Range of quantity:”, range(df_clean$quantity))) #2.Exploratory Data Analysis ##2.1 Dataset Summary After Cleaning cat(“Cleaned dataset dimensions:”, dim(df_clean), “”) summary(df_clean) ##2.2 Univariate Analysis ### Return distribution ggplot(df_clean, aes(x = is_returned, fill = is_returned)) + geom_bar() + labs(title = “Distribution of Returns”, x = “Returned”, y = “Count”) + theme_minimal() ###Returns account for about 6% of orders. The dataset has class imbalance ##————————————————————————– ### Quantity distribution ggplot(df_clean, aes(x = quantity)) + geom_histogram(bins = 30, fill = “steelblue”, alpha = 0.7) + labs(title = “Distribution of Sales Quantity”, x = “Quantity”, y = “Frequency”) + theme_minimal() ###Order quantities are mostly between 2 and 4 items. Very few orders have 1 or 6 items. The distribution is relatively concentrated ##————————————————————————– ### Price distribution (log scale) ggplot(df_clean, aes(x = product_price)) + geom_histogram(bins = 40, fill = “darkgreen”, alpha = 0.7) + scale_x_log10() + labs(title = “Product Price Distribution (log scale)”, x = “Price (log)”, y = “Frequency”) + theme_minimal() ###Product prices are right-skewed. Most products are priced between $50 and $200, with a small number of high‑price items. ##————————————————————————– ##2.3 Bivariate Analysis - Classification (is_returned) ###Delivery days vs return ggplot(df_clean, aes(x = is_returned, y = delivery_days, fill = is_returned)) + geom_boxplot() + labs(title = “Delivery Days by Return Status”, x = “Returned”, y = “Delivery Days”) + theme_minimal() ###Returned orders have longer average delivery days (~6 days) compared to non‑returned orders (~5 days). Longer delivery times are associated with higher return risk. ##————————————————————————– ###Discount vs return proportion df_clean %>% mutate(discount_group = cut(discount_percent, breaks = c(0,5,15,30,100), labels = c(“0-5%”,“5-15%”,“15-30%”,“30-100%”))) %>% ggplot(aes(x = discount_group, fill = is_returned)) + geom_bar(position = “fill”) + labs(title = “Return Proportion by Discount Level”, x = “Discount Percent”, y = “Proportion”) + theme_minimal() ###Orders with high discounts (30%+) show a slightly higher return proportion, but the difference is not strong. Lower discount groups have lower return rates. ##————————————————————————– ###Customer rating density by return ggplot(df_clean, aes(x = customer_rating, fill = is_returned)) + geom_density(alpha = 0.5) + labs(title = “Customer Rating Density by Return Status”, x = “Rating”, y = “Density”) + theme_minimal() ###Returned orders tend to have lower ratings (concentrated around 2.5–3.5) while non‑returned orders have higher ratings (3.5–4.5). Low rating is positively correlated with returns. ##————————————————————————– ##2.4 Bivariate Analysis - Regression (quantity) ### Price vs quantity ggplot(df_clean, aes(x = product_price, y = quantity)) + geom_point(alpha = 0.2) + geom_smooth(method = “lm”, se = FALSE, color = “red”) + labs(title = “Product Price vs Sales Quantity”, x = “Price”, y = “Quantity”) + theme_minimal() ###There is a weak negative correlation between price and quantity. Higher‑priced items tend to be purchased in slightly smaller quantities. ##————————————————————————– ### Average quantity by region region_qty <- df_clean %>% group_by(region) %>% summarise(avg_quantity = mean(quantity), .groups = “drop”) ggplot(region_qty, aes(x = reorder(region, avg_quantity), y = avg_quantity, fill = region)) + geom_col() + coord_flip() + labs(title = “Average Sales Quantity by Region”, x = “Region”, y = “Avg Quantity”) + theme_minimal() ###North America and Europe have the highest average sales quantity per order, while Africa and South America have relatively lower averages. This suggests region‑specific stock and promotion strategies. ##————————————————————————– ### Discount vs quantity (colored by return) ggplot(df_clean, aes(x = discount_percent, y = quantity, color = is_returned)) + geom_point(alpha = 0.5) + geom_smooth(se = FALSE, method = “lm”) + labs(title = “Discount vs Quantity (colored by Return)”, x = “Discount (%)”, y = “Quantity”) + theme_minimal() ###No clear linear relationship between discount and quantity. Returned orders are scattered across all discount levels without a clear pattern. ##————————————————————————– ##2.5 Correlation Matrix numeric_vars <- df_clean %>% select(where(is.numeric)) cor_matrix <- cor(numeric_vars, use = “complete.obs”) corrplot(cor_matrix, method = “number”, type = “upper”, tl.cex = 0.8, number.cex = 0.7) ##Revenue is highly correlated with quantity and price (by definition). Discount shows weak correlation with quantity; delivery days and return also show weak correlation (around 0.1–0.2). ##————————————————————————– ## 2.6 Time Trend Analysis df_clean %>% group_by(month = floor_date(order_date, “month”)) %>% summarise( total_quantity = sum(quantity), return_rate = mean(as.numeric(is_returned) - 1, na.rm = TRUE), .groups = “drop” ) %>% pivot_longer(cols = c(total_quantity, return_rate), names_to = “metric”, values_to = “value”) %>% ggplot(aes(x = month, y = value)) + geom_line(color = “steelblue”) + facet_wrap(~metric, scales = “free_y”) + labs(title = “Monthly Trends: Total Quantity and Return Rate”, x = “Month”, y = ““) + theme_minimal() ##Total sales quantity peaks in November and December (holiday season). The return rate remains stable around 5–7% throughout the year, with no significant increase during peak months.
##————————————————————————-
##2.7 Category-Level Insights category_return <- df_clean %>% group_by(product_category) %>% summarise( return_rate = mean(as.integer(is_returned) - 1, na.rm = TRUE), total_sales = sum(quantity * product_price * (1 - discount_percent/100)) ) %>% arrange(desc(return_rate))

print(category_return)

ggplot(category_return, aes(x = reorder(product_category, return_rate), y = return_rate, fill = product_category)) + geom_col() + coord_flip() + labs(title = “Return Rate by Product Category”, x = “Category”, y = “Return Rate”) + theme_minimal() ##Beauty and Fashion have the highest return rates (approximately 8–10%). Electronics and Automotive have the lowest return rates (approximately 3–5%). It is recommended to improve product descriptions or quality for high‑return categories.

#3. Key Findings cat(“========== EDA SUMMARY ==========”) cat(“1. Overall return rate:”, mean(as.numeric(df_clean$is_returned) - 1, na.rm = TRUE) * 100, "%\n") cat("2. Average quantity per order:", mean(df_clean$quantity), “”) cat(“3. Top 3 high-return categories:”) print(head(category_return %>% arrange(desc(return_rate)), 3)) cat(“. Correlation between delivery_days and return (point-biserial):”, cor(as.numeric(df_clean$delivery_days), as.numeric(df_clean$is_returned) - 1), “”) cat(“5. Best region for sales volume:”, region_qty$region[which.max(region_qty$avg_quantity)], “”)

Untitled

Group4

2026-06-06

1.Data Cleaning