library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data <- read.csv(file.choose(), stringsAsFactors = FALSE)
#DATA PREPARATION
data$Order.Date <- as.Date(sub(" .*", "", data$Order.Date))
data$Ship.Date <- as.Date(sub(" .*", "", data$Ship.Date))
data <- data %>%
mutate(
Year = format(Order.Date, "%Y"),
Month = factor(format(Order.Date, "%b"), levels = month.abb),
YearMonth = format(Order.Date, "%Y-%m"),
DateNumeric = as.numeric(Order.Date),
Margin = Profit / Sales
)
str(data)
## 'data.frame': 51290 obs. of 31 variables:
## $ Category : chr "Office Supplies" "Office Supplies" "Office Supplies" "Office Supplies" ...
## $ City : chr "Los Angeles" "Los Angeles" "Los Angeles" "Los Angeles" ...
## $ Country : chr "United States" "United States" "United States" "United States" ...
## $ Customer.ID : chr "LS-172304" "MV-174854" "CS-121304" "CS-121304" ...
## $ Customer.Name : chr "Lycoris Saunders" "Mark Van Huff" "Chad Sievert" "Chad Sievert" ...
## $ Discount : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Market : chr "US" "US" "US" "US" ...
## $ 记录数 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Order.Date : Date, format: "2011-01-07" "2011-01-21" ...
## $ Order.ID : chr "CA-2011-130813" "CA-2011-148614" "CA-2011-118962" "CA-2011-118962" ...
## $ Order.Priority: chr "High" "Medium" "Medium" "Medium" ...
## $ Product.ID : chr "OFF-PA-10002005" "OFF-PA-10002893" "OFF-PA-10000659" "OFF-PA-10001144" ...
## $ Product.Name : chr "Xerox 225" "Wirebound Service Call Books, 5 1/2\" x 4\"" "Adams Phone Message Book, Professional, 400 Message Capacity, 5 3/6” x 11”" "Xerox 1913" ...
## $ Profit : num 9.33 9.29 9.84 53.26 3.11 ...
## $ Quantity : int 3 2 3 2 1 3 3 2 9 4 ...
## $ Region : chr "West" "West" "West" "West" ...
## $ Row.ID : int 36624 37033 31468 31469 32440 32890 35608 33709 35267 38866 ...
## $ Sales : int 19 19 21 111 6 13 19 12 54 49 ...
## $ Segment : chr "Consumer" "Consumer" "Consumer" "Consumer" ...
## $ Ship.Date : Date, format: "2011-01-09" "2011-01-26" ...
## $ Ship.Mode : chr "Second Class" "Standard Class" "Standard Class" "Standard Class" ...
## $ Shipping.Cost : num 4.37 0.94 1.81 4.59 1.32 2.39 1.15 0.92 6.98 1.99 ...
## $ State : chr "California" "California" "California" "California" ...
## $ Sub.Category : chr "Paper" "Paper" "Paper" "Paper" ...
## $ Year : chr "2011" "2011" "2011" "2011" ...
## $ Market2 : chr "North America" "North America" "North America" "North America" ...
## $ weeknum : int 2 4 32 32 40 43 45 46 48 50 ...
## $ Month : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 1 8 8 9 10 11 11 11 12 ...
## $ YearMonth : chr "2011-01" "2011-01" "2011-08" "2011-08" ...
## $ DateNumeric : num 14981 14995 15191 15191 15246 ...
## $ Margin : num 0.491 0.489 0.469 0.48 0.518 ...
#Q1: What is the overall sales, profit, and profit margin?
total_sales <- sum(data$Sales, na.rm = TRUE)
total_profit <- sum(data$Profit, na.rm = TRUE)
profit_margin <- (total_profit / total_sales) * 100
total_sales
## [1] 12642905
total_profit
## [1] 1467457
profit_margin
## [1] 11.60696
#Q2: How do sales and profit vary across different regions?
region_summary <- data %>%
group_by(Region) %>%
summarise(Sales = sum(Sales, na.rm = TRUE),
Profit = sum(Profit, na.rm = TRUE),
.groups = "drop")
region_summary
## # A tibble: 13 × 3
## Region Sales Profit
## <chr> <int> <dbl>
## 1 Africa 783776 88872.
## 2 Canada 66932 17817.
## 3 Caribbean 324281 34571.
## 4 Central 2822399 311404.
## 5 Central Asia 752839 132480.
## 6 EMEA 806184 43898.
## 7 East 678834 91523.
## 8 North 1248192 194598.
## 9 North Asia 848349 165578.
## 10 Oceania 1100207 120089.
## 11 South 1600960 140356.
## 12 Southeast Asia 884438 17852.
## 13 West 725514 108418.
#Visualization: Sales by Region
ggplot(region_summary, aes(x = Region, y = Sales)) +
geom_bar(stat = "identity", fill = "skyblue") +
ggtitle("Sales by Region")

#Q3: How have sales changed over time on a monthly basis?
monthly_sales <- data %>%
group_by(YearMonth) %>%
summarise(Sales = sum(Sales, na.rm = TRUE), .groups = "drop")
monthly_sales
## # A tibble: 48 × 2
## YearMonth Sales
## <chr> <int>
## 1 2011-01 98902
## 2 2011-02 91152
## 3 2011-03 145726
## 4 2011-04 116927
## 5 2011-05 146762
## 6 2011-06 215214
## 7 2011-07 115518
## 8 2011-08 207570
## 9 2011-09 290230
## 10 2011-10 199070
## # ℹ 38 more rows
#Visualization: Monthly Sales Trend
ggplot(monthly_sales, aes(x = as.Date(paste0(YearMonth, "-01")), y = Sales)) +
geom_line(color = "blue") +
theme(axis.text.x = element_text(angle = 90)) +
ggtitle("Monthly Sales Trend")

#Q4: Which product categories generate the highest sales?
category_sales <- data %>%
group_by(Category) %>%
summarise(Sales = sum(Sales, na.rm = TRUE), .groups = "drop")
category_sales
## # A tibble: 3 × 2
## Category Sales
## <chr> <int>
## 1 Furniture 4110884
## 2 Office Supplies 3787330
## 3 Technology 4744691
#Visualization: Sales by Category
ggplot(category_sales, aes(x = Category, y = Sales)) +
geom_bar(stat = "identity", fill = "orange") +
ggtitle("Sales by Category")

#Q5: What is the distribution of profit across transactions?
summary(data$Profit)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6599.98 0.00 9.24 28.61 36.81 8399.98
#Visualization: Profit Distribution
ggplot(data, aes(x = Profit)) +
geom_histogram(bins = 50, fill = "steelblue") +
ggtitle("Profit Distribution")

#Q6: What is the relationship between discount and profit?
cor(data$Discount, data$Profit)
## [1] -0.3164902
#Visualization: Discount vs Profit
ggplot(data, aes(x = Discount, y = Profit)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", color = "red") +
ggtitle("Discount vs Profit")
## `geom_smooth()` using formula = 'y ~ x'

#Q7: Which categories have the highest profit margins?
category_margin <- data %>%
group_by(Category) %>%
summarise(Sales = sum(Sales), Profit = sum(Profit), .groups = "drop") %>%
mutate(Margin = Profit / Sales)
#Q8: Which region-category combinations are loss-making?
loss_segments <- data %>%
group_by(Region, Category) %>%
summarise(Profit = sum(Profit), .groups = "drop") %>%
filter(Profit < 0)
loss_segments
## # A tibble: 1 × 3
## Region Category Profit
## <chr> <chr> <dbl>
## 1 Southeast Asia Furniture -7270.
#Visualization: Loss-Making Segments
ggplot(loss_segments, aes(x = Category, y = Profit, fill = Region)) +
geom_bar(stat = "identity") +
ggtitle("Loss-Making Segments")

#Q9: How does shipping mode affect profit and cost?
ship_analysis <- data %>%
group_by(Ship.Mode) %>%
summarise(AvgProfit = mean(Profit), AvgShippingCost = mean(Shipping.Cost), .groups = "drop")
#Q10: Which individual products are consistently loss-making?
loss_products <- data %>%
group_by(Product.Name) %>%
summarise(Profit = sum(Profit), .groups = "drop") %>%
filter(Profit < 0)
#Q11: Which regions generate the highest total profit?
high_profit_regions <- region_summary %>% arrange(desc(Profit))
#Q12: Which products contribute the most to overall profit?
top_products <- data %>%
group_by(Product.Name) %>%
summarise(Profit = sum(Profit), .groups = "drop") %>%
arrange(desc(Profit))
#Q13: What is the overall business summary?
business_summary <- data %>%
summarise(
Total_Sales = sum(Sales),
Total_Profit = sum(Profit),
Avg_Discount = mean(Discount),
Avg_Shipping_Cost = mean(Shipping.Cost)
)
business_summary
## Total_Sales Total_Profit Avg_Discount Avg_Shipping_Cost
## 1 12642905 1467457 0.1429075 26.37582
#Q14: Which regions have the highest profit margins?
region_margin <- data %>%
group_by(Region) %>%
summarise(Sales = sum(Sales), Profit = sum(Profit), .groups = "drop") %>%
mutate(Margin = Profit / Sales)
#Q15: Which category performs best within each region?
category_region_perf <- data %>%
group_by(Region, Category) %>%
summarise(Profit = sum(Profit), .groups = "drop")
#Q16: Which products generate high sales but low profit?
low_margin_products <- data %>%
group_by(Product.Name) %>%
summarise(Sales = sum(Sales), Profit = sum(Profit), .groups = "drop") %>%
filter(Sales > mean(Sales) & Profit < mean(Profit))
#Q17: Which regions generate high sales but low profitability?
inefficient_regions <- data %>%
group_by(Region) %>%
summarise(Sales = sum(Sales), Profit = sum(Profit), .groups = "drop") %>%
mutate(Margin = Profit / Sales)
#Q18: How do different discount levels impact average profit?
discount_groups <- data %>%
mutate(Level = ifelse(Discount < 0.2, "Low", "High")) %>%
group_by(Level) %>%
summarise(AvgProfit = mean(Profit), .groups = "drop")
#Q19: Which shipping mode is most cost-efficient?
ship_efficiency <- data %>%
group_by(Ship.Mode) %>%
summarise(Efficiency = mean(Profit) / mean(Shipping.Cost), .groups = "drop")
#Q20: Which categories are most affected by discounts?
category_discount <- data %>%
group_by(Category) %>%
summarise(Correlation = cor(Discount, Profit), .groups = "drop")
#Q21: What percentage of total profit comes from each region?
region_contribution <- data %>%
group_by(Region) %>%
summarise(Profit = sum(Profit), .groups = "drop") %>%
mutate(Share = Profit / sum(Profit))
#Visualization: Pie Chart (Profit Share)
pie(region_contribution$Share, labels = region_contribution$Region)

#Q22: What seasonal patterns exist in sales?
seasonal_sales <- data %>%
group_by(Month) %>%
summarise(Sales = sum(Sales), .groups = "drop")
#Q23: Which regions offer the highest average discounts?
region_discount <- data %>%
group_by(Region) %>%
summarise(AvgDiscount = mean(Discount), .groups = "drop")
#Q24: Are there high-quantity transactions that result in losses?
bulk_loss <- data %>%
filter(Quantity > quantile(Quantity, 0.75) & Profit < 0)
#Q25: What is the relationship between shipping cost and profit?
cor(data$Shipping.Cost, data$Profit)
## [1] 0.3544408
#Q26: How is category sales share distributed within each region?
category_share <- data %>%
group_by(Region, Category) %>%
summarise(Sales = sum(Sales), .groups = "drop") %>%
group_by(Region) %>%
mutate(Share = Sales / sum(Sales))
#Q27: Which products frequently generate losses?
risk_products <- data %>%
group_by(Product.Name) %>%
summarise(LossCount = sum(Profit < 0), .groups = "drop")
#Q28: Which category shows the highest variability in profit?
category_variance <- data %>%
group_by(Category) %>%
summarise(Variance = var(Profit), .groups = "drop")
#Q29: Which regions have the most stable profits?
region_stability <- data %>%
group_by(Region) %>%
summarise(SD = sd(Profit), .groups = "drop")
#Q30: Which products generate both high sales and high profit?
top_balanced_products <- data %>%
group_by(Product.Name) %>%
summarise(Sales = sum(Sales), Profit = sum(Profit), .groups = "drop") %>%
filter(Sales > quantile(Sales, 0.9) & Profit > quantile(Profit, 0.9))
#Box Plot
ggplot(data, aes(x = Category, y = Profit)) +
geom_boxplot() +
ggtitle("Profit Distribution by Category")

#Scatter Plot (Sales vs Discount)
ggplot(data, aes(x = Discount, y = Sales)) +
geom_point(alpha = 0.3)

#Regression Plot
ggplot(data, aes(x = Discount, y = Sales)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", color = "red")
## `geom_smooth()` using formula = 'y ~ x'

#Polynomial Plot
ggplot(data, aes(x = Discount, y = Sales)) +
geom_point(alpha = 0.3) +
stat_smooth(method = "lm", formula = y ~ poly(x, 2), color = "blue")

#ECDF
plot(ecdf(data$Profit), main = "ECDF of Profit")

#Pair Plot
pairs(data %>% select(Sales, Profit, Discount, Quantity, Shipping.Cost))
