Sales analysis

# load tidyverse
library(tidyverse)

# load data
library(readxl)
Superstore_clean <- read_excel("~/Downloads/Superstore_clean.xlsx")

Explore data

# display column names
names(Superstore_clean)

##  [1] "Row_ID"        "Order_ID"      "Order_Date"    "Ship_Date"    
##  [5] "Ship_Mode"     "Customer_ID"   "Customer_Name" "Segment"      
##  [9] "Country"       "City"          "State"         "Postal_Code"  
## [13] "Region"        "Product_ID"    "Category"      "Sub_Category" 
## [17] "Product_Name"  "Sales"         "Quantity"      "Discount"     
## [21] "Profit"

# checking variable types
str(Superstore_clean)

## tibble [9,994 × 21] (S3: tbl_df/tbl/data.frame)
##  $ Row_ID       : num [1:9994] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Order_ID     : chr [1:9994] "CA-2016-152156" "CA-2016-152156" "CA-2016-138688" "US-2015-108966" ...
##  $ Order_Date   : POSIXct[1:9994], format: "2016-11-08" "2016-11-08" ...
##  $ Ship_Date    : POSIXct[1:9994], format: "2016-11-11" "2016-11-11" ...
##  $ Ship_Mode    : chr [1:9994] "Second Class" "Second Class" "Second Class" "Standard Class" ...
##  $ Customer_ID  : chr [1:9994] "CG-12520" "CG-12520" "DV-13045" "SO-20335" ...
##  $ Customer_Name: chr [1:9994] "Claire Gute" "Claire Gute" "Darrin Van Huff" "Sean O'Donnell" ...
##  $ Segment      : chr [1:9994] "Consumer" "Consumer" "Corporate" "Consumer" ...
##  $ Country      : chr [1:9994] "United States" "United States" "United States" "United States" ...
##  $ City         : chr [1:9994] "Henderson" "Henderson" "Los Angeles" "Fort Lauderdale" ...
##  $ State        : chr [1:9994] "Kentucky" "Kentucky" "California" "Florida" ...
##  $ Postal_Code  : num [1:9994] 42420 42420 90036 33311 33311 ...
##  $ Region       : chr [1:9994] "South" "South" "West" "South" ...
##  $ Product_ID   : chr [1:9994] "FUR-BO-10001798" "FUR-CH-10000454" "OFF-LA-10000240" "FUR-TA-10000577" ...
##  $ Category     : chr [1:9994] "Furniture" "Furniture" "Office Supplies" "Furniture" ...
##  $ Sub_Category : chr [1:9994] "Bookcases" "Chairs" "Labels" "Tables" ...
##  $ Product_Name : chr [1:9994] "Bush Somerset Collection Bookcase" "Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back" "Self-Adhesive Address Labels for Typewriters by Universal" "Bretford CR4500 Series Slim Rectangular Table" ...
##  $ Sales        : num [1:9994] 262 731.9 14.6 957.6 22.4 ...
##  $ Quantity     : num [1:9994] 2 3 2 5 2 7 4 6 3 5 ...
##  $ Discount     : num [1:9994] 0 0 0 0.45 0.2 0 0 0.2 0.2 0 ...
##  $ Profit       : num [1:9994] 41.91 219.58 6.87 -383.03 2.52 ...

# view the first lines
head(Superstore_clean)

## # A tibble: 6 × 21
##   Row_ID Order_ID  Order_Date          Ship_Date           Ship_Mode Customer_ID
##    <dbl> <chr>     <dttm>              <dttm>              <chr>     <chr>      
## 1      1 CA-2016-… 2016-11-08 00:00:00 2016-11-11 00:00:00 Second C… CG-12520   
## 2      2 CA-2016-… 2016-11-08 00:00:00 2016-11-11 00:00:00 Second C… CG-12520   
## 3      3 CA-2016-… 2016-06-12 00:00:00 2016-06-16 00:00:00 Second C… DV-13045   
## 4      4 US-2015-… 2015-10-11 00:00:00 2015-10-18 00:00:00 Standard… SO-20335   
## 5      5 US-2015-… 2015-10-11 00:00:00 2015-10-18 00:00:00 Standard… SO-20335   
## 6      6 CA-2014-… 2014-06-09 00:00:00 2014-06-14 00:00:00 Standard… BH-11710   
## # ℹ 15 more variables: Customer_Name <chr>, Segment <chr>, Country <chr>,
## #   City <chr>, State <chr>, Postal_Code <dbl>, Region <chr>, Product_ID <chr>,
## #   Category <chr>, Sub_Category <chr>, Product_Name <chr>, Sales <dbl>,
## #   Quantity <dbl>, Discount <dbl>, Profit <dbl>

Calculate item revenue

sales_calc <- Superstore_clean %>% 
  # use mutate to create new column
  mutate(
    item_revenue = (Sales * Quantity) * (1-Discount) 
  )

Group data by category, calculate total revenue and profit

sales_by_category <- sales_calc %>% 
  group_by(Category) %>% 
  summarise(
    total_category_sales = sum(item_revenue),
    total_category_profit = sum(Profit),
    total_category_quantity = sum(Quantity),
    .group = 'drop' #ungroup
  ) %>% 
  arrange(desc(total_category_sales))
print(sales_by_category)

## # A tibble: 3 × 5
##   Category     total_category_sales total_category_profit total_category_quant…¹
##   <chr>                       <dbl>                 <dbl>                  <dbl>
## 1 Technology               3491750.               145455.                   6939
## 2 Furniture                3229365.                18451.                   8028
## 3 Office Supp…             3173069.               122491.                  22906
## # ℹ abbreviated name: ¹total_category_quantity
## # ℹ 1 more variable: .group <chr>

Category analysis summary : 1. Technology - Highest revenue and highest profit. 2. Furniture - High revenue, but extremely low profit. 3. Office Suppliers - ~High revenue and the second highest profit.

Furniture category analysis to identify specific products and sub-categories that are reducing revenue.

# filter the data to include only "Furniture" and group by sub-category
furniture_profit_analysis <- sales_calc %>% 
  filter(Category == "Furniture") %>% 
  group_by(`Sub_Category`) %>%   # always use backtricks `` when column name contains spaces or other symbols _
  summarise(
    total_revenue = sum(item_revenue),
    total_profit = sum(Profit),
    # calculate the average discount %
    avg_discount = mean(Discount),
    .groups = 'drop'
  ) %>% 
  arrange(total_profit)
print(furniture_profit_analysis)

## # A tibble: 4 × 4
##   Sub_Category total_revenue total_profit avg_discount
##   <chr>                <dbl>        <dbl>        <dbl>
## 1 Tables             904949.      -17725.        0.261
## 2 Bookcases          486763.       -3473.        0.211
## 3 Furnishings        413820.       13059.        0.138
## 4 Chairs            1423833.       26590.        0.170

Analysis of loss-making products

# filter the data to include only products with negative profit
# filter by sub-category
# group by product_name
negative_profit <- sales_calc %>% 
  filter(Profit < 0) %>%  # keep only rows with negative profit
  filter(`Sub_Category` %in% c("Tables", "Bookcases")) %>%  #keep only problematic categories
  group_by(`Product_ID`, `Product_Name`, `Sub_Category`) %>% 
  summarise(
    total_revenue = sum(item_revenue),
    total_profit = sum(Profit),
    total_quantity = sum(Quantity),
    avg_discount = mean(Discount),
    .groups = 'drop'
  ) %>% 
  arrange(total_profit)
# table generator
knitr::kable(head(negative_profit, 10), 
             caption = "Top 10 Most Unprofitable Furniture Products", 
             digits = 2) # round all numerical columns to two decimal places

Top 10 Most Unprofitable Furniture Products
Product_ID	Product_Name	Sub_Category	total_revenue	total_profit	total_quantity	avg_discount
FUR-TA-10000198	Chromcraft Bull-Nose Wood Oval Conference Tables & Bases	Tables	46612.91	-3107.53	24	0.35
FUR-TA-10001889	Bush Advantage Collection Racetrack Conference Table	Tables	17821.06	-2545.26	25	0.41
FUR-BO-10004834	Riverside Palais Royal Lawyers Bookcase, Royale Cherry Finish	Bookcases	17309.85	-1982.21	11	0.41
FUR-TA-10001950	Balt Solid Wood Round Tables	Tables	10447.87	-1522.53	11	0.40
FUR-TA-10002958	Bevis Oval Conference Table, Walnut	Tables	13095.98	-1344.05	25	0.37
FUR-TA-10004154	Riverside Furniture Oval Coffee Table, Oval End Table, End Table with Drawer	Tables	23788.47	-1210.51	24	0.38
FUR-TA-10003473	Bretford Rectangular Conference Table Tops	Tables	12191.31	-1158.48	19	0.39
FUR-TA-10004289	BoxOffice By Design Rectangular and Half-Moon Meeting Room Tables	Tables	5280.62	-1148.44	15	0.48
FUR-TA-10004256	Bretford “Just In Time” Height-Adjustable Multi-Task Work Tables	Tables	14830.22	-1081.07	13	0.30
FUR-BO-10001972	O’Sullivan 4-Shelf Bookcase in Odessa Pine	Bookcases	2702.69	-1047.69	22	0.56

Time Trend Analysis

#summarize orders
orders_summary_final <- sales_calc %>% 
  group_by(`Order_ID`, `Order_Date`) %>% 
  summarise(
    total_sales_rev = sum(item_revenue),
    total_profit = sum(Profit),
    .groups = 'drop'
  )

#monthly data
monthly_data <- orders_summary_final %>% 
  mutate(
    #extract the start date of the month
    order_year_month = lubridate::floor_date(`Order_Date`, "month")
  ) %>% 
  group_by(order_year_month) %>% 
  summarise(
    monthly_sales_rev = sum(total_sales_rev),
    monthly_profit = sum(total_profit),
    monthly_orders = n(),
    .groups = 'drop'
  )
knitr::kable(head(monthly_data), caption = "First rows of the data aggregated by month", digits = 2)

First rows of the data aggregated by month
order_year_month	monthly_sales_rev	monthly_profit	monthly_orders
2014-01-01	71137.39	2450.19	32
2014-02-01	18847.85	862.31	28
2014-03-01	199717.60	498.73	71
2014-04-01	129873.56	3488.84	66
2014-05-01	96039.71	2738.71	69
2014-06-01	151510.92	4976.52	66

monthly_data %>% 
  ggplot(aes(x = order_year_month, y = monthly_profit)) + 
  geom_line(color = "#0072B2", size = 1) + 
  geom_point(color = "#0072B2") +
  geom_smooth(method = "lm", se = FALSE, color = "red", linetype = "dashed") +
  labs(
    title = "Monthly Revenue Trend",
    x = "Month and Year",
    y = "Total Profit(USD)"
  ) + 
  theme_minimal() + 
  scale_y_continuous(labels = scales::dollar) + 
  scale_x_datetime( #adding months to the plot
    date_labels = "%b %Y", #month+year
    date_breaks = "3 months"
  ) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Sales analysis

Anetta Nikolajeva

2025-11-22