library(dplyr)
## 
## Attachement du package : 'dplyr'
## Les objets suivants sont masqués depuis 'package:stats':
## 
##     filter, lag
## Les objets suivants sont masqués depuis 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
## 
## Attachement du package : 'lubridate'
## Les objets suivants sont masqués depuis 'package:base':
## 
##     date, intersect, setdiff, union
library(scales)
library(ggcorrplot)
## Warning: le package 'ggcorrplot' a été compilé avec la version R 4.5.2
library(readr)
## 
## Attachement du package : 'readr'
## L'objet suivant est masqué depuis 'package:scales':
## 
##     col_factor

1.Introduction

This report presents a full analysis of the Sample Superstore dataset.

Goal:
Clean the data, calculate KPIs, perform exploratory data analysis,visualize trends,and draw actionable insights.

2.Data Cleaning

The dataset contains sales records from a superstore.
We will :
-Convert dates to Date format

-Add Year and Month columns

-Check missing values and duplicates

-Calculate profit margin

data <- read.csv("Sample - Superstore.csv")

cleaned_data <- data %>% mutate(
Order.Date = as_date(Order.Date, format = "%m/%d/%Y"),
Ship.Date  = as_date(Ship.Date, format = "%m/%d/%Y"),
Year  = year(Order.Date),
Month = month(Order.Date, label = TRUE),
Profit_Ratio = round(Profit / Sales, 2)
)

Missing values

colSums(is.na(cleaned_data))
##        Row.ID      Order.ID    Order.Date     Ship.Date     Ship.Mode 
##             0             0             0             0             0 
##   Customer.ID Customer.Name       Segment       Country          City 
##             0             0             0             0             0 
##         State   Postal.Code        Region    Product.ID      Category 
##             0             0             0             0             0 
##  Sub.Category  Product.Name         Sales      Quantity      Discount 
##             0             0             0             0             0 
##        Profit          Year         Month  Profit_Ratio 
##             0             0             0             0

Duplicates

anyDuplicated(cleaned_data)
## [1] 0

3.KPIs

We calculate the main Key Performance Indicators (KPIs) for the superstore.

sales_total  <- sum(cleaned_data$Sales)
profit_total <- sum(cleaned_data$Profit)
profit_margin <- round((profit_total / sales_total) * 100, 2)

KPI <- data.frame(
Total_Sales   = paste0(label_comma()(sales_total), "$"),
Total_Profit  = paste0(label_comma()(profit_total), "$"),
Profit_Margin = paste0(profit_margin, "%")
)
KPI  
##   Total_Sales Total_Profit Profit_Margin
## 1  2,297,201$     286,397$        12.47%

Insights

Total Sales and Profit give an overview of business performance

Profit Margin indicates efficiency in generating profit from sales

4.Exploratory Analysis

summary(cleaned_data$Sales)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##     0.444    17.280    54.490   229.858   209.940 22638.480
summary(cleaned_data$Profit)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -6599.978     1.729     8.666    28.657    29.364  8399.976
table(cleaned_data$Region)
## 
## Central    East   South    West 
##    2323    2848    1620    3203
table(cleaned_data$Category)
## 
##       Furniture Office Supplies      Technology 
##            2121            6026            1847

Observations:

Some regions and categories outperform others

Distribution of Sales and Profit is skewed → few high-value orders

5.Visualizations

5.1 Sales by Category

category_summary <- cleaned_data %>%
group_by(Category) %>%
summarise(Total_Sales = sum(Sales))
category_summary
## # A tibble: 3 × 2
##   Category        Total_Sales
##   <chr>                 <dbl>
## 1 Furniture           742000.
## 2 Office Supplies     719047.
## 3 Technology          836154.
ggplot(category_summary, aes(x = reorder(Category, -Total_Sales), y = Total_Sales)) +
geom_bar(stat="identity", fill="#600D07") +
geom_text(aes(label = label_comma()(Total_Sales)), vjust=-0.4, color="#3C3C4C", size=3) +
theme_minimal() +
ggtitle("Sales by Category") +
xlab("Category") + ylab("Total Sales")

Insights:

-Technology products generate the highest sales
-Furniture has moderate sales
-Office Supplies is the lowest
-Suggest focusing marketing on Technology products

5.2 Sales by Region

region_summary <- cleaned_data %>%
group_by(Region) %>%
summarise(Total_Sales = sum(Sales))
region_summary
## # A tibble: 4 × 2
##   Region  Total_Sales
##   <chr>         <dbl>
## 1 Central     501240.
## 2 East        678781.
## 3 South       391722.
## 4 West        725458.
ggplot(region_summary, aes(x = reorder(Region, -Total_Sales), y = Total_Sales)) +
geom_bar(stat="identity", fill="#0B4F0B") +
geom_text(aes(label = label_comma()(Total_Sales)), vjust=-0.4, color="#3C3C4C", size=3) +
theme_minimal() +
ggtitle("Sales by Region") +
xlab("Region") + ylab("Total Sales")

Insights:
-West region leads in total sales
-East and South have lower sales
-Suggest focusing expansion efforts in top regions

5.3 Sales per Month

Month_summary <- cleaned_data %>%
group_by(Month) %>%
summarise(Total_Sales = sum(Sales))

ggplot(Month_summary, aes(x = Month, y = Total_Sales, group = 1)) +
  geom_line(color = "#0B4F0B", linewidth = 1.2) +
  geom_point(color = "#600D07", size = 2) +  # use size instead of linewidth
  geom_text(aes(label = label_comma()(Total_Sales)), vjust = -0.8, size = 3) +  # size instead of linewidth
  theme_minimal() +
  ggtitle("Sales per Month") +
  xlab("Month") + ylab("Sales")

Insights:

Sales peak sharply in November (300,000) but drop lowest in June–July (~95,000), showing strong seasonality with pre-holiday surge and a mid-year slowdown.

5.4 Correlation Heatmap

corr_matrix <- cleaned_data %>%
select(Sales, Profit, Discount, Quantity) %>%
cor()

suppressWarnings(
  ggcorrplot(corr_matrix,
             hc.order = TRUE,
             type = "lower",
             lab = TRUE,
             lab_size = 4,
             colors = c("#0B4F0B", "#F5F0E6", "#600D07"),
             title = "Correlation Matrix",
             ggtheme = theme_minimal())
)

Insights:

Sales & Profit: strong positive correlation (0.48)

Discount & Profit: weak negative correlation (-0.22)

Quantity shows limited linear relationships

5.5 Profit vs Sales

ggplot() +
  geom_point(aes(x = 1:nrow(cleaned_data), y = cleaned_data$Sales), color = "#600D07", size = 3) +   # Sales points
  geom_point(aes(x = 1:nrow(cleaned_data), y = cleaned_data$Profit), color = "#D0716D", size = 3) +  # Profit points
  theme_minimal()+
  xlab("Profit")+
  ylab("Sales")+
  theme_minimal()+
  ggtitle("Sales vs profit")

Insights :
Higher sales usually mean higher profits, but some low-sales items have high margins.
This shows we could improve overall profit by adjusting the product mix.

5.6 Profit by Category – Boxplot

cleaned_data$Category <- factor(cleaned_data$Category,
levels=c("Furniture","Office Supplies","Technology"))

ggplot(cleaned_data, aes(x=Category, y=Profit)) +
geom_boxplot(fill="#D88A1A") +
theme_minimal() +
ggtitle("Profit by Category") +
xlab("Category") + ylab("Profit")

Insights:

Technology is the most profitable category, while Office Supplies operates at a loss.This suggests a need to review pricing, costs, or sales focus for underperforming categories.

6.Insights & Recommendations

-Focus marketing and inventory on Technology products

-Reduce excessive discounts that slightly reduce profit

-Expand in high-performing regions like West

-Monitor peak sales months for promotions and stock planning

Conclusion
The analysis shows that Technology products and the West region drive the highest sales and profits, while other categories and regions lag behind. Sales peak in November and drop mid-year, revealing seasonal trends. Optimizing the product mix and discount strategies could further increase profitability. These insights can guide marketing, inventory, and expansion decisions to improve overall business performance.