# load tidyverse
library(tidyverse)
# load data
library(readxl)
Superstore_clean <- read_excel("~/Downloads/Superstore_clean.xlsx")
Explore data
# display column names
names(Superstore_clean)
## [1] "Row_ID" "Order_ID" "Order_Date" "Ship_Date"
## [5] "Ship_Mode" "Customer_ID" "Customer_Name" "Segment"
## [9] "Country" "City" "State" "Postal_Code"
## [13] "Region" "Product_ID" "Category" "Sub_Category"
## [17] "Product_Name" "Sales" "Quantity" "Discount"
## [21] "Profit"
# checking variable types
str(Superstore_clean)
## tibble [9,994 × 21] (S3: tbl_df/tbl/data.frame)
## $ Row_ID : num [1:9994] 1 2 3 4 5 6 7 8 9 10 ...
## $ Order_ID : chr [1:9994] "CA-2016-152156" "CA-2016-152156" "CA-2016-138688" "US-2015-108966" ...
## $ Order_Date : POSIXct[1:9994], format: "2016-11-08" "2016-11-08" ...
## $ Ship_Date : POSIXct[1:9994], format: "2016-11-11" "2016-11-11" ...
## $ Ship_Mode : chr [1:9994] "Second Class" "Second Class" "Second Class" "Standard Class" ...
## $ Customer_ID : chr [1:9994] "CG-12520" "CG-12520" "DV-13045" "SO-20335" ...
## $ Customer_Name: chr [1:9994] "Claire Gute" "Claire Gute" "Darrin Van Huff" "Sean O'Donnell" ...
## $ Segment : chr [1:9994] "Consumer" "Consumer" "Corporate" "Consumer" ...
## $ Country : chr [1:9994] "United States" "United States" "United States" "United States" ...
## $ City : chr [1:9994] "Henderson" "Henderson" "Los Angeles" "Fort Lauderdale" ...
## $ State : chr [1:9994] "Kentucky" "Kentucky" "California" "Florida" ...
## $ Postal_Code : num [1:9994] 42420 42420 90036 33311 33311 ...
## $ Region : chr [1:9994] "South" "South" "West" "South" ...
## $ Product_ID : chr [1:9994] "FUR-BO-10001798" "FUR-CH-10000454" "OFF-LA-10000240" "FUR-TA-10000577" ...
## $ Category : chr [1:9994] "Furniture" "Furniture" "Office Supplies" "Furniture" ...
## $ Sub_Category : chr [1:9994] "Bookcases" "Chairs" "Labels" "Tables" ...
## $ Product_Name : chr [1:9994] "Bush Somerset Collection Bookcase" "Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back" "Self-Adhesive Address Labels for Typewriters by Universal" "Bretford CR4500 Series Slim Rectangular Table" ...
## $ Sales : num [1:9994] 262 731.9 14.6 957.6 22.4 ...
## $ Quantity : num [1:9994] 2 3 2 5 2 7 4 6 3 5 ...
## $ Discount : num [1:9994] 0 0 0 0.45 0.2 0 0 0.2 0.2 0 ...
## $ Profit : num [1:9994] 41.91 219.58 6.87 -383.03 2.52 ...
# view the first lines
head(Superstore_clean)
## # A tibble: 6 × 21
## Row_ID Order_ID Order_Date Ship_Date Ship_Mode Customer_ID
## <dbl> <chr> <dttm> <dttm> <chr> <chr>
## 1 1 CA-2016-… 2016-11-08 00:00:00 2016-11-11 00:00:00 Second C… CG-12520
## 2 2 CA-2016-… 2016-11-08 00:00:00 2016-11-11 00:00:00 Second C… CG-12520
## 3 3 CA-2016-… 2016-06-12 00:00:00 2016-06-16 00:00:00 Second C… DV-13045
## 4 4 US-2015-… 2015-10-11 00:00:00 2015-10-18 00:00:00 Standard… SO-20335
## 5 5 US-2015-… 2015-10-11 00:00:00 2015-10-18 00:00:00 Standard… SO-20335
## 6 6 CA-2014-… 2014-06-09 00:00:00 2014-06-14 00:00:00 Standard… BH-11710
## # ℹ 15 more variables: Customer_Name <chr>, Segment <chr>, Country <chr>,
## # City <chr>, State <chr>, Postal_Code <dbl>, Region <chr>, Product_ID <chr>,
## # Category <chr>, Sub_Category <chr>, Product_Name <chr>, Sales <dbl>,
## # Quantity <dbl>, Discount <dbl>, Profit <dbl>
Calculate item revenue
sales_calc <- Superstore_clean %>%
# use mutate to create new column
mutate(
item_revenue = (Sales * Quantity) * (1-Discount)
)
Group data by category, calculate total revenue and profit
sales_by_category <- sales_calc %>%
group_by(Category) %>%
summarise(
total_category_sales = sum(item_revenue),
total_category_profit = sum(Profit),
total_category_quantity = sum(Quantity),
.group = 'drop' #ungroup
) %>%
arrange(desc(total_category_sales))
print(sales_by_category)
## # A tibble: 3 × 5
## Category total_category_sales total_category_profit total_category_quant…¹
## <chr> <dbl> <dbl> <dbl>
## 1 Technology 3491750. 145455. 6939
## 2 Furniture 3229365. 18451. 8028
## 3 Office Supp… 3173069. 122491. 22906
## # ℹ abbreviated name: ¹total_category_quantity
## # ℹ 1 more variable: .group <chr>
Category analysis summary : 1. Technology - Highest revenue and highest profit. 2. Furniture - High revenue, but extremely low profit. 3. Office Suppliers - ~High revenue and the second highest profit.
Furniture category analysis to identify specific products and sub-categories that are reducing revenue.
# filter the data to include only "Furniture" and group by sub-category
furniture_profit_analysis <- sales_calc %>%
filter(Category == "Furniture") %>%
group_by(`Sub_Category`) %>% # always use backtricks `` when column name contains spaces or other symbols _
summarise(
total_revenue = sum(item_revenue),
total_profit = sum(Profit),
# calculate the average discount %
avg_discount = mean(Discount),
.groups = 'drop'
) %>%
arrange(total_profit)
print(furniture_profit_analysis)
## # A tibble: 4 × 4
## Sub_Category total_revenue total_profit avg_discount
## <chr> <dbl> <dbl> <dbl>
## 1 Tables 904949. -17725. 0.261
## 2 Bookcases 486763. -3473. 0.211
## 3 Furnishings 413820. 13059. 0.138
## 4 Chairs 1423833. 26590. 0.170
Analysis of loss-making products
# filter the data to include only products with negative profit
# filter by sub-category
# group by product_name
negative_profit <- sales_calc %>%
filter(Profit < 0) %>% # keep only rows with negative profit
filter(`Sub_Category` %in% c("Tables", "Bookcases")) %>% #keep only problematic categories
group_by(`Product_ID`, `Product_Name`, `Sub_Category`) %>%
summarise(
total_revenue = sum(item_revenue),
total_profit = sum(Profit),
total_quantity = sum(Quantity),
avg_discount = mean(Discount),
.groups = 'drop'
) %>%
arrange(total_profit)
# table generator
knitr::kable(head(negative_profit, 10),
caption = "Top 10 Most Unprofitable Furniture Products",
digits = 2) # round all numerical columns to two decimal places
| Product_ID | Product_Name | Sub_Category | total_revenue | total_profit | total_quantity | avg_discount |
|---|---|---|---|---|---|---|
| FUR-TA-10000198 | Chromcraft Bull-Nose Wood Oval Conference Tables & Bases | Tables | 46612.91 | -3107.53 | 24 | 0.35 |
| FUR-TA-10001889 | Bush Advantage Collection Racetrack Conference Table | Tables | 17821.06 | -2545.26 | 25 | 0.41 |
| FUR-BO-10004834 | Riverside Palais Royal Lawyers Bookcase, Royale Cherry Finish | Bookcases | 17309.85 | -1982.21 | 11 | 0.41 |
| FUR-TA-10001950 | Balt Solid Wood Round Tables | Tables | 10447.87 | -1522.53 | 11 | 0.40 |
| FUR-TA-10002958 | Bevis Oval Conference Table, Walnut | Tables | 13095.98 | -1344.05 | 25 | 0.37 |
| FUR-TA-10004154 | Riverside Furniture Oval Coffee Table, Oval End Table, End Table with Drawer | Tables | 23788.47 | -1210.51 | 24 | 0.38 |
| FUR-TA-10003473 | Bretford Rectangular Conference Table Tops | Tables | 12191.31 | -1158.48 | 19 | 0.39 |
| FUR-TA-10004289 | BoxOffice By Design Rectangular and Half-Moon Meeting Room Tables | Tables | 5280.62 | -1148.44 | 15 | 0.48 |
| FUR-TA-10004256 | Bretford “Just In Time” Height-Adjustable Multi-Task Work Tables | Tables | 14830.22 | -1081.07 | 13 | 0.30 |
| FUR-BO-10001972 | O’Sullivan 4-Shelf Bookcase in Odessa Pine | Bookcases | 2702.69 | -1047.69 | 22 | 0.56 |
Time Trend Analysis
#summarize orders
orders_summary_final <- sales_calc %>%
group_by(`Order_ID`, `Order_Date`) %>%
summarise(
total_sales_rev = sum(item_revenue),
total_profit = sum(Profit),
.groups = 'drop'
)
#monthly data
monthly_data <- orders_summary_final %>%
mutate(
#extract the start date of the month
order_year_month = lubridate::floor_date(`Order_Date`, "month")
) %>%
group_by(order_year_month) %>%
summarise(
monthly_sales_rev = sum(total_sales_rev),
monthly_profit = sum(total_profit),
monthly_orders = n(),
.groups = 'drop'
)
knitr::kable(head(monthly_data), caption = "First rows of the data aggregated by month", digits = 2)
| order_year_month | monthly_sales_rev | monthly_profit | monthly_orders |
|---|---|---|---|
| 2014-01-01 | 71137.39 | 2450.19 | 32 |
| 2014-02-01 | 18847.85 | 862.31 | 28 |
| 2014-03-01 | 199717.60 | 498.73 | 71 |
| 2014-04-01 | 129873.56 | 3488.84 | 66 |
| 2014-05-01 | 96039.71 | 2738.71 | 69 |
| 2014-06-01 | 151510.92 | 4976.52 | 66 |
monthly_data %>%
ggplot(aes(x = order_year_month, y = monthly_profit)) +
geom_line(color = "#0072B2", size = 1) +
geom_point(color = "#0072B2") +
geom_smooth(method = "lm", se = FALSE, color = "red", linetype = "dashed") +
labs(
title = "Monthly Revenue Trend",
x = "Month and Year",
y = "Total Profit(USD)"
) +
theme_minimal() +
scale_y_continuous(labels = scales::dollar) +
scale_x_datetime( #adding months to the plot
date_labels = "%b %Y", #month+year
date_breaks = "3 months"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))