library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data <- read.csv(file.choose(), stringsAsFactors = FALSE)
#DATA PREPARATION
data$Order.Date <- as.Date(sub(" .*", "", data$Order.Date))
data$Ship.Date  <- as.Date(sub(" .*", "", data$Ship.Date))
data <- data %>%
  mutate(
    Year = format(Order.Date, "%Y"),
    Month = factor(format(Order.Date, "%b"), levels = month.abb),
    YearMonth = format(Order.Date, "%Y-%m"),
    DateNumeric = as.numeric(Order.Date),
    Margin = Profit / Sales
  )
str(data)
## 'data.frame':    51290 obs. of  31 variables:
##  $ Category      : chr  "Office Supplies" "Office Supplies" "Office Supplies" "Office Supplies" ...
##  $ City          : chr  "Los Angeles" "Los Angeles" "Los Angeles" "Los Angeles" ...
##  $ Country       : chr  "United States" "United States" "United States" "United States" ...
##  $ Customer.ID   : chr  "LS-172304" "MV-174854" "CS-121304" "CS-121304" ...
##  $ Customer.Name : chr  "Lycoris Saunders" "Mark Van Huff" "Chad Sievert" "Chad Sievert" ...
##  $ Discount      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Market        : chr  "US" "US" "US" "US" ...
##  $ 记录数        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Order.Date    : Date, format: "2011-01-07" "2011-01-21" ...
##  $ Order.ID      : chr  "CA-2011-130813" "CA-2011-148614" "CA-2011-118962" "CA-2011-118962" ...
##  $ Order.Priority: chr  "High" "Medium" "Medium" "Medium" ...
##  $ Product.ID    : chr  "OFF-PA-10002005" "OFF-PA-10002893" "OFF-PA-10000659" "OFF-PA-10001144" ...
##  $ Product.Name  : chr  "Xerox 225" "Wirebound Service Call Books, 5 1/2\" x 4\"" "Adams Phone Message Book, Professional, 400 Message Capacity, 5 3/6” x 11”" "Xerox 1913" ...
##  $ Profit        : num  9.33 9.29 9.84 53.26 3.11 ...
##  $ Quantity      : int  3 2 3 2 1 3 3 2 9 4 ...
##  $ Region        : chr  "West" "West" "West" "West" ...
##  $ Row.ID        : int  36624 37033 31468 31469 32440 32890 35608 33709 35267 38866 ...
##  $ Sales         : int  19 19 21 111 6 13 19 12 54 49 ...
##  $ Segment       : chr  "Consumer" "Consumer" "Consumer" "Consumer" ...
##  $ Ship.Date     : Date, format: "2011-01-09" "2011-01-26" ...
##  $ Ship.Mode     : chr  "Second Class" "Standard Class" "Standard Class" "Standard Class" ...
##  $ Shipping.Cost : num  4.37 0.94 1.81 4.59 1.32 2.39 1.15 0.92 6.98 1.99 ...
##  $ State         : chr  "California" "California" "California" "California" ...
##  $ Sub.Category  : chr  "Paper" "Paper" "Paper" "Paper" ...
##  $ Year          : chr  "2011" "2011" "2011" "2011" ...
##  $ Market2       : chr  "North America" "North America" "North America" "North America" ...
##  $ weeknum       : int  2 4 32 32 40 43 45 46 48 50 ...
##  $ Month         : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 1 8 8 9 10 11 11 11 12 ...
##  $ YearMonth     : chr  "2011-01" "2011-01" "2011-08" "2011-08" ...
##  $ DateNumeric   : num  14981 14995 15191 15191 15246 ...
##  $ Margin        : num  0.491 0.489 0.469 0.48 0.518 ...
#Q1: What is the overall sales, profit, and profit margin?
total_sales <- sum(data$Sales, na.rm = TRUE)
total_profit <- sum(data$Profit, na.rm = TRUE)
profit_margin <- (total_profit / total_sales) * 100

total_sales
## [1] 12642905
total_profit
## [1] 1467457
profit_margin
## [1] 11.60696
#Q2: How do sales and profit vary across different regions?
region_summary <- data %>%
  group_by(Region) %>%
  summarise(Sales = sum(Sales, na.rm = TRUE),
            Profit = sum(Profit, na.rm = TRUE),
            .groups = "drop")

region_summary
## # A tibble: 13 × 3
##    Region           Sales  Profit
##    <chr>            <int>   <dbl>
##  1 Africa          783776  88872.
##  2 Canada           66932  17817.
##  3 Caribbean       324281  34571.
##  4 Central        2822399 311404.
##  5 Central Asia    752839 132480.
##  6 EMEA            806184  43898.
##  7 East            678834  91523.
##  8 North          1248192 194598.
##  9 North Asia      848349 165578.
## 10 Oceania        1100207 120089.
## 11 South          1600960 140356.
## 12 Southeast Asia  884438  17852.
## 13 West            725514 108418.
#Visualization: Sales by Region
ggplot(region_summary, aes(x = Region, y = Sales)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  ggtitle("Sales by Region")

#Q3: How have sales changed over time on a monthly basis?
monthly_sales <- data %>%
  group_by(YearMonth) %>%
  summarise(Sales = sum(Sales, na.rm = TRUE), .groups = "drop")

monthly_sales
## # A tibble: 48 × 2
##    YearMonth  Sales
##    <chr>      <int>
##  1 2011-01    98902
##  2 2011-02    91152
##  3 2011-03   145726
##  4 2011-04   116927
##  5 2011-05   146762
##  6 2011-06   215214
##  7 2011-07   115518
##  8 2011-08   207570
##  9 2011-09   290230
## 10 2011-10   199070
## # ℹ 38 more rows
#Visualization: Monthly Sales Trend
ggplot(monthly_sales, aes(x = as.Date(paste0(YearMonth, "-01")), y = Sales)) +
  geom_line(color = "blue") +
  theme(axis.text.x = element_text(angle = 90)) +
  ggtitle("Monthly Sales Trend")

#Q4: Which product categories generate the highest sales?
category_sales <- data %>%
  group_by(Category) %>%
  summarise(Sales = sum(Sales, na.rm = TRUE), .groups = "drop")

category_sales
## # A tibble: 3 × 2
##   Category          Sales
##   <chr>             <int>
## 1 Furniture       4110884
## 2 Office Supplies 3787330
## 3 Technology      4744691
#Visualization: Sales by Category
ggplot(category_sales, aes(x = Category, y = Sales)) +
  geom_bar(stat = "identity", fill = "orange") +
  ggtitle("Sales by Category")

#Q5: What is the distribution of profit across transactions?
summary(data$Profit)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -6599.98     0.00     9.24    28.61    36.81  8399.98
#Visualization: Profit Distribution
ggplot(data, aes(x = Profit)) +
  geom_histogram(bins = 50, fill = "steelblue") +
  ggtitle("Profit Distribution")

#Q6: What is the relationship between discount and profit?
cor(data$Discount, data$Profit)
## [1] -0.3164902
#Visualization: Discount vs Profit
ggplot(data, aes(x = Discount, y = Profit)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", color = "red") +
  ggtitle("Discount vs Profit")
## `geom_smooth()` using formula = 'y ~ x'

#Q7: Which categories have the highest profit margins?
category_margin <- data %>%
  group_by(Category) %>%
  summarise(Sales = sum(Sales), Profit = sum(Profit), .groups = "drop") %>%
  mutate(Margin = Profit / Sales)
#Q8: Which region-category combinations are loss-making?
loss_segments <- data %>%
  group_by(Region, Category) %>%
  summarise(Profit = sum(Profit), .groups = "drop") %>%
  filter(Profit < 0)

loss_segments
## # A tibble: 1 × 3
##   Region         Category  Profit
##   <chr>          <chr>      <dbl>
## 1 Southeast Asia Furniture -7270.
#Visualization: Loss-Making Segments
ggplot(loss_segments, aes(x = Category, y = Profit, fill = Region)) +
  geom_bar(stat = "identity") +
  ggtitle("Loss-Making Segments")

#Q9: How does shipping mode affect profit and cost?
ship_analysis <- data %>%
  group_by(Ship.Mode) %>%
  summarise(AvgProfit = mean(Profit), AvgShippingCost = mean(Shipping.Cost), .groups = "drop")
#Q10: Which individual products are consistently loss-making?
loss_products <- data %>%
  group_by(Product.Name) %>%
  summarise(Profit = sum(Profit), .groups = "drop") %>%
  filter(Profit < 0)
#Q11: Which regions generate the highest total profit?
high_profit_regions <- region_summary %>% arrange(desc(Profit))
#Q12: Which products contribute the most to overall profit?
top_products <- data %>%
  group_by(Product.Name) %>%
  summarise(Profit = sum(Profit), .groups = "drop") %>%
  arrange(desc(Profit))
#Q13: What is the overall business summary?
business_summary <- data %>%
  summarise(
    Total_Sales = sum(Sales),
    Total_Profit = sum(Profit),
    Avg_Discount = mean(Discount),
    Avg_Shipping_Cost = mean(Shipping.Cost)
  )

business_summary
##   Total_Sales Total_Profit Avg_Discount Avg_Shipping_Cost
## 1    12642905      1467457    0.1429075          26.37582
#Q14: Which regions have the highest profit margins?
region_margin <- data %>%
  group_by(Region) %>%
  summarise(Sales = sum(Sales), Profit = sum(Profit), .groups = "drop") %>%
  mutate(Margin = Profit / Sales)
#Q15: Which category performs best within each region?
category_region_perf <- data %>%
  group_by(Region, Category) %>%
  summarise(Profit = sum(Profit), .groups = "drop")
#Q16: Which products generate high sales but low profit?
low_margin_products <- data %>%
  group_by(Product.Name) %>%
  summarise(Sales = sum(Sales), Profit = sum(Profit), .groups = "drop") %>%
  filter(Sales > mean(Sales) & Profit < mean(Profit))
#Q17: Which regions generate high sales but low profitability?
inefficient_regions <- data %>%
  group_by(Region) %>%
  summarise(Sales = sum(Sales), Profit = sum(Profit), .groups = "drop") %>%
  mutate(Margin = Profit / Sales)
#Q18: How do different discount levels impact average profit?
discount_groups <- data %>%
  mutate(Level = ifelse(Discount < 0.2, "Low", "High")) %>%
  group_by(Level) %>%
  summarise(AvgProfit = mean(Profit), .groups = "drop")
#Q19: Which shipping mode is most cost-efficient?
ship_efficiency <- data %>%
  group_by(Ship.Mode) %>%
  summarise(Efficiency = mean(Profit) / mean(Shipping.Cost), .groups = "drop")
#Q20: Which categories are most affected by discounts?
category_discount <- data %>%
  group_by(Category) %>%
  summarise(Correlation = cor(Discount, Profit), .groups = "drop")
#Q21: What percentage of total profit comes from each region?
region_contribution <- data %>%
  group_by(Region) %>%
  summarise(Profit = sum(Profit), .groups = "drop") %>%
  mutate(Share = Profit / sum(Profit))
#Visualization: Pie Chart (Profit Share)
pie(region_contribution$Share, labels = region_contribution$Region)

#Q22: What seasonal patterns exist in sales?
seasonal_sales <- data %>%
  group_by(Month) %>%
  summarise(Sales = sum(Sales), .groups = "drop")
#Q23: Which regions offer the highest average discounts?
region_discount <- data %>%
  group_by(Region) %>%
  summarise(AvgDiscount = mean(Discount), .groups = "drop")
#Q24: Are there high-quantity transactions that result in losses?
bulk_loss <- data %>%
  filter(Quantity > quantile(Quantity, 0.75) & Profit < 0)
#Q25: What is the relationship between shipping cost and profit?
cor(data$Shipping.Cost, data$Profit)
## [1] 0.3544408
#Q26: How is category sales share distributed within each region?
category_share <- data %>%
  group_by(Region, Category) %>%
  summarise(Sales = sum(Sales), .groups = "drop") %>%
  group_by(Region) %>%
  mutate(Share = Sales / sum(Sales))
#Q27: Which products frequently generate losses?
risk_products <- data %>%
  group_by(Product.Name) %>%
  summarise(LossCount = sum(Profit < 0), .groups = "drop")
#Q28: Which category shows the highest variability in profit?
category_variance <- data %>%
  group_by(Category) %>%
  summarise(Variance = var(Profit), .groups = "drop")
#Q29: Which regions have the most stable profits?
region_stability <- data %>%
  group_by(Region) %>%
  summarise(SD = sd(Profit), .groups = "drop")
#Q30: Which products generate both high sales and high profit?
top_balanced_products <- data %>%
  group_by(Product.Name) %>%
  summarise(Sales = sum(Sales), Profit = sum(Profit), .groups = "drop") %>%
  filter(Sales > quantile(Sales, 0.9) & Profit > quantile(Profit, 0.9))
#Box Plot
ggplot(data, aes(x = Category, y = Profit)) +
  geom_boxplot() +
  ggtitle("Profit Distribution by Category")

#Scatter Plot (Sales vs Discount)
ggplot(data, aes(x = Discount, y = Sales)) +
  geom_point(alpha = 0.3)

#Regression Plot
ggplot(data, aes(x = Discount, y = Sales)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", color = "red")
## `geom_smooth()` using formula = 'y ~ x'

#Polynomial Plot
ggplot(data, aes(x = Discount, y = Sales)) +
  geom_point(alpha = 0.3) +
  stat_smooth(method = "lm", formula = y ~ poly(x, 2), color = "blue")

#ECDF
plot(ecdf(data$Profit), main = "ECDF of Profit")

#Pair Plot
pairs(data %>% select(Sales, Profit, Discount, Quantity, Shipping.Cost))