# load tidyverse
library(tidyverse)
# load data
library(readxl)
Superstore_clean <- read_excel("~/Downloads/Superstore_clean.xlsx")

Explore data

# display column names
names(Superstore_clean)
##  [1] "Row_ID"        "Order_ID"      "Order_Date"    "Ship_Date"    
##  [5] "Ship_Mode"     "Customer_ID"   "Customer_Name" "Segment"      
##  [9] "Country"       "City"          "State"         "Postal_Code"  
## [13] "Region"        "Product_ID"    "Category"      "Sub_Category" 
## [17] "Product_Name"  "Sales"         "Quantity"      "Discount"     
## [21] "Profit"
# checking variable types
str(Superstore_clean)
## tibble [9,994 × 21] (S3: tbl_df/tbl/data.frame)
##  $ Row_ID       : num [1:9994] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Order_ID     : chr [1:9994] "CA-2016-152156" "CA-2016-152156" "CA-2016-138688" "US-2015-108966" ...
##  $ Order_Date   : POSIXct[1:9994], format: "2016-11-08" "2016-11-08" ...
##  $ Ship_Date    : POSIXct[1:9994], format: "2016-11-11" "2016-11-11" ...
##  $ Ship_Mode    : chr [1:9994] "Second Class" "Second Class" "Second Class" "Standard Class" ...
##  $ Customer_ID  : chr [1:9994] "CG-12520" "CG-12520" "DV-13045" "SO-20335" ...
##  $ Customer_Name: chr [1:9994] "Claire Gute" "Claire Gute" "Darrin Van Huff" "Sean O'Donnell" ...
##  $ Segment      : chr [1:9994] "Consumer" "Consumer" "Corporate" "Consumer" ...
##  $ Country      : chr [1:9994] "United States" "United States" "United States" "United States" ...
##  $ City         : chr [1:9994] "Henderson" "Henderson" "Los Angeles" "Fort Lauderdale" ...
##  $ State        : chr [1:9994] "Kentucky" "Kentucky" "California" "Florida" ...
##  $ Postal_Code  : num [1:9994] 42420 42420 90036 33311 33311 ...
##  $ Region       : chr [1:9994] "South" "South" "West" "South" ...
##  $ Product_ID   : chr [1:9994] "FUR-BO-10001798" "FUR-CH-10000454" "OFF-LA-10000240" "FUR-TA-10000577" ...
##  $ Category     : chr [1:9994] "Furniture" "Furniture" "Office Supplies" "Furniture" ...
##  $ Sub_Category : chr [1:9994] "Bookcases" "Chairs" "Labels" "Tables" ...
##  $ Product_Name : chr [1:9994] "Bush Somerset Collection Bookcase" "Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back" "Self-Adhesive Address Labels for Typewriters by Universal" "Bretford CR4500 Series Slim Rectangular Table" ...
##  $ Sales        : num [1:9994] 262 731.9 14.6 957.6 22.4 ...
##  $ Quantity     : num [1:9994] 2 3 2 5 2 7 4 6 3 5 ...
##  $ Discount     : num [1:9994] 0 0 0 0.45 0.2 0 0 0.2 0.2 0 ...
##  $ Profit       : num [1:9994] 41.91 219.58 6.87 -383.03 2.52 ...
# view the first lines
head(Superstore_clean)
## # A tibble: 6 × 21
##   Row_ID Order_ID  Order_Date          Ship_Date           Ship_Mode Customer_ID
##    <dbl> <chr>     <dttm>              <dttm>              <chr>     <chr>      
## 1      1 CA-2016-… 2016-11-08 00:00:00 2016-11-11 00:00:00 Second C… CG-12520   
## 2      2 CA-2016-… 2016-11-08 00:00:00 2016-11-11 00:00:00 Second C… CG-12520   
## 3      3 CA-2016-… 2016-06-12 00:00:00 2016-06-16 00:00:00 Second C… DV-13045   
## 4      4 US-2015-… 2015-10-11 00:00:00 2015-10-18 00:00:00 Standard… SO-20335   
## 5      5 US-2015-… 2015-10-11 00:00:00 2015-10-18 00:00:00 Standard… SO-20335   
## 6      6 CA-2014-… 2014-06-09 00:00:00 2014-06-14 00:00:00 Standard… BH-11710   
## # ℹ 15 more variables: Customer_Name <chr>, Segment <chr>, Country <chr>,
## #   City <chr>, State <chr>, Postal_Code <dbl>, Region <chr>, Product_ID <chr>,
## #   Category <chr>, Sub_Category <chr>, Product_Name <chr>, Sales <dbl>,
## #   Quantity <dbl>, Discount <dbl>, Profit <dbl>

Calculate item revenue

sales_calc <- Superstore_clean %>% 
  # use mutate to create new column
  mutate(
    item_revenue = (Sales * Quantity) * (1-Discount) 
  )

Group data by category, calculate total revenue and profit

sales_by_category <- sales_calc %>% 
  group_by(Category) %>% 
  summarise(
    total_category_sales = sum(item_revenue),
    total_category_profit = sum(Profit),
    total_category_quantity = sum(Quantity),
    .group = 'drop' #ungroup
  ) %>% 
  arrange(desc(total_category_sales))
print(sales_by_category)
## # A tibble: 3 × 5
##   Category     total_category_sales total_category_profit total_category_quant…¹
##   <chr>                       <dbl>                 <dbl>                  <dbl>
## 1 Technology               3491750.               145455.                   6939
## 2 Furniture                3229365.                18451.                   8028
## 3 Office Supp…             3173069.               122491.                  22906
## # ℹ abbreviated name: ¹​total_category_quantity
## # ℹ 1 more variable: .group <chr>

Category analysis summary : 1. Technology - Highest revenue and highest profit. 2. Furniture - High revenue, but extremely low profit. 3. Office Suppliers - ~High revenue and the second highest profit.

Furniture category analysis to identify specific products and sub-categories that are reducing revenue.

# filter the data to include only "Furniture" and group by sub-category
furniture_profit_analysis <- sales_calc %>% 
  filter(Category == "Furniture") %>% 
  group_by(`Sub_Category`) %>%   # always use backtricks `` when column name contains spaces or other symbols _
  summarise(
    total_revenue = sum(item_revenue),
    total_profit = sum(Profit),
    # calculate the average discount %
    avg_discount = mean(Discount),
    .groups = 'drop'
  ) %>% 
  arrange(total_profit)
print(furniture_profit_analysis)
## # A tibble: 4 × 4
##   Sub_Category total_revenue total_profit avg_discount
##   <chr>                <dbl>        <dbl>        <dbl>
## 1 Tables             904949.      -17725.        0.261
## 2 Bookcases          486763.       -3473.        0.211
## 3 Furnishings        413820.       13059.        0.138
## 4 Chairs            1423833.       26590.        0.170

Analysis of loss-making products

# filter the data to include only products with negative profit
# filter by sub-category
# group by product_name
negative_profit <- sales_calc %>% 
  filter(Profit < 0) %>%  # keep only rows with negative profit
  filter(`Sub_Category` %in% c("Tables", "Bookcases")) %>%  #keep only problematic categories
  group_by(`Product_ID`, `Product_Name`, `Sub_Category`) %>% 
  summarise(
    total_revenue = sum(item_revenue),
    total_profit = sum(Profit),
    total_quantity = sum(Quantity),
    avg_discount = mean(Discount),
    .groups = 'drop'
  ) %>% 
  arrange(total_profit)
# table generator
knitr::kable(head(negative_profit, 10), 
             caption = "Top 10 Most Unprofitable Furniture Products", 
             digits = 2) # round all numerical columns to two decimal places
Top 10 Most Unprofitable Furniture Products
Product_ID Product_Name Sub_Category total_revenue total_profit total_quantity avg_discount
FUR-TA-10000198 Chromcraft Bull-Nose Wood Oval Conference Tables & Bases Tables 46612.91 -3107.53 24 0.35
FUR-TA-10001889 Bush Advantage Collection Racetrack Conference Table Tables 17821.06 -2545.26 25 0.41
FUR-BO-10004834 Riverside Palais Royal Lawyers Bookcase, Royale Cherry Finish Bookcases 17309.85 -1982.21 11 0.41
FUR-TA-10001950 Balt Solid Wood Round Tables Tables 10447.87 -1522.53 11 0.40
FUR-TA-10002958 Bevis Oval Conference Table, Walnut Tables 13095.98 -1344.05 25 0.37
FUR-TA-10004154 Riverside Furniture Oval Coffee Table, Oval End Table, End Table with Drawer Tables 23788.47 -1210.51 24 0.38
FUR-TA-10003473 Bretford Rectangular Conference Table Tops Tables 12191.31 -1158.48 19 0.39
FUR-TA-10004289 BoxOffice By Design Rectangular and Half-Moon Meeting Room Tables Tables 5280.62 -1148.44 15 0.48
FUR-TA-10004256 Bretford “Just In Time” Height-Adjustable Multi-Task Work Tables Tables 14830.22 -1081.07 13 0.30
FUR-BO-10001972 O’Sullivan 4-Shelf Bookcase in Odessa Pine Bookcases 2702.69 -1047.69 22 0.56

Time Trend Analysis

#summarize orders
orders_summary_final <- sales_calc %>% 
  group_by(`Order_ID`, `Order_Date`) %>% 
  summarise(
    total_sales_rev = sum(item_revenue),
    total_profit = sum(Profit),
    .groups = 'drop'
  )
#monthly data
monthly_data <- orders_summary_final %>% 
  mutate(
    #extract the start date of the month
    order_year_month = lubridate::floor_date(`Order_Date`, "month")
  ) %>% 
  group_by(order_year_month) %>% 
  summarise(
    monthly_sales_rev = sum(total_sales_rev),
    monthly_profit = sum(total_profit),
    monthly_orders = n(),
    .groups = 'drop'
  )
knitr::kable(head(monthly_data), caption = "First rows of the data aggregated by month", digits = 2)
First rows of the data aggregated by month
order_year_month monthly_sales_rev monthly_profit monthly_orders
2014-01-01 71137.39 2450.19 32
2014-02-01 18847.85 862.31 28
2014-03-01 199717.60 498.73 71
2014-04-01 129873.56 3488.84 66
2014-05-01 96039.71 2738.71 69
2014-06-01 151510.92 4976.52 66
monthly_data %>% 
  ggplot(aes(x = order_year_month, y = monthly_profit)) + 
  geom_line(color = "#0072B2", size = 1) + 
  geom_point(color = "#0072B2") +
  geom_smooth(method = "lm", se = FALSE, color = "red", linetype = "dashed") +
  labs(
    title = "Monthly Revenue Trend",
    x = "Month and Year",
    y = "Total Profit(USD)"
  ) + 
  theme_minimal() + 
  scale_y_continuous(labels = scales::dollar) + 
  scale_x_datetime( #adding months to the plot
    date_labels = "%b %Y", #month+year
    date_breaks = "3 months"
  ) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))