1. Introduction

This R Markdown notebook presents a full analysis of the Superstore dataset, focusing on sales, profit, discounts, customer behavior, and regional performance.

Tools used: Google Sheets, BigQuery (SQL), R, Tableau.

2. Load Libraries

install.packages("bigrquery")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(readr)
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(bigrquery)
superstore <- bq_table_download(
  bq_table("studied-flow-472916-a9", "sample_superstore", "superstore")
)
## ! Using an auto-discovered, cached token.
##   To suppress this message, modify your code or options to clearly consent to
##   the use of a cached token.
##   See gargle's "Non-interactive auth" vignette for more details:
##   <https://gargle.r-lib.org/articles/non-interactive-auth.html>
## ℹ The bigrquery package is using a cached token for
##   'israeloyetoro11@gmail.com'.
## Auto-refreshing stale OAuth token.

3. Load Dataset

superstore <- read_csv("superstore.csv") %>% clean_names()
## Rows: 9994 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): Order ID, Order Date, Ship Date, Ship Mode, Customer ID, Customer ...
## dbl  (5): Row ID, Sales, Quantity, Discount, Profit
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(superstore)
## # A tibble: 6 × 21
##   row_id order_id       order_date ship_date ship_mode customer_id customer_name
##    <dbl> <chr>          <chr>      <chr>     <chr>     <chr>       <chr>        
## 1      1 CA-2016-152156 11/8/2016  11/11/20… Second C… CG-12520    Claire Gute  
## 2      2 CA-2016-152156 11/8/2016  11/11/20… Second C… CG-12520    Claire Gute  
## 3      3 CA-2016-138688 6/12/2016  6/16/2016 Second C… DV-13045    Darrin Van H…
## 4      4 US-2015-108966 10/11/2015 10/18/20… Standard… SO-20335    Sean O'Donne…
## 5      5 US-2015-108966 10/11/2015 10/18/20… Standard… SO-20335    Sean O'Donne…
## 6      6 CA-2014-115812 6/9/2014   6/14/2014 Standard… BH-11710    Brosina Hoff…
## # ℹ 14 more variables: segment <chr>, country <chr>, city <chr>, state <chr>,
## #   postal_code <chr>, region <chr>, product_id <chr>, category <chr>,
## #   sub_category <chr>, product_name <chr>, sales <dbl>, quantity <dbl>,
## #   discount <dbl>, profit <dbl>

4. Data Cleaning & Preparation

superstore <- superstore %>%
  mutate(order_date = mdy(order_date),
         ship_date = mdy(ship_date),
         year = year(order_date),
         month = month(order_date, label = TRUE),
         profit_ratio = profit / sales)

5. Exploratory Data Analysis

5.1 Sales & Profit Summary

superstore %>% summarise(
  total_sales = sum(sales),
  total_profit = sum(profit),
  avg_discount = mean(discount)
)
## # A tibble: 1 × 3
##   total_sales total_profit avg_discount
##         <dbl>        <dbl>        <dbl>
## 1    2297201.      286397.        0.156

5.2 Category-Level Insights

superstore %>% 
  group_by(category) %>% 
  summarise(sales = sum(sales), profit = sum(profit)) %>%
  arrange(desc(profit))
## # A tibble: 3 × 3
##   category          sales  profit
##   <chr>             <dbl>   <dbl>
## 1 Technology      836154. 145455.
## 2 Office Supplies 719047. 122491.
## 3 Furniture       742000.  18451.

5.3 Regional Performance

superstore %>% 
  group_by(region) %>%
  summarise(sales = sum(sales), profit = sum(profit)) %>%
  arrange(desc(profit))
## # A tibble: 4 × 3
##   region    sales  profit
##   <chr>     <dbl>   <dbl>
## 1 West    725458. 108418.
## 2 East    678781.  91523.
## 3 South   391722.  46749.
## 4 Central 501240.  39706.

6. Visualizations

Sales Trend Over Time

superstore %>%
  group_by(year, month) %>%
  summarise(sales = sum(sales)) %>%
  ggplot(aes(x = interaction(year, month), y = sales, group = year)) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Monthly Sales Trend", x = "Year-Month", y = "Sales")
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

Profit vs Discount

ggplot(superstore, aes(x = discount, y = profit)) +
  geom_point(alpha = 0.3) +
  geom_smooth() +
  labs(title = "Discount Impact on Profit")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

7. Deep-Dive Insights

Top 10 Most Profitable Products

superstore %>%
  group_by(product_name) %>%
  summarise(total_profit = sum(profit)) %>%
  arrange(desc(total_profit)) %>%
  head(10)
## # A tibble: 10 × 2
##    product_name                                                     total_profit
##    <chr>                                                                   <dbl>
##  1 "Canon imageCLASS 2200 Advanced Copier"                                25200.
##  2 "Fellowes PB500 Electric Punch Plastic Comb Binding Machine wit…        7753.
##  3 "Hewlett Packard LaserJet 3310 Copier"                                  6984.
##  4 "Canon PC1060 Personal Laser Copier"                                    4571.
##  5 "HP Designjet T520 Inkjet Large Format Printer - 24\" Color"            4095.
##  6 "Ativa V4110MDD Micro-Cut Shredder"                                     3773.
##  7 "3D Systems Cube Printer, 2nd Generation, Magenta"                      3718.
##  8 "Plantronics Savi W720 Multi-Device Wireless Headset System"            3696.
##  9 "Ibico EPK-21 Electric Binding System"                                  3345.
## 10 "Zebra ZM400 Thermal Label Printer"                                     3344.

Bottom 10 Loss-Making Products

superstore %>%
  group_by(product_name) %>%
  summarise(total_profit = sum(profit)) %>%
  arrange(total_profit) %>%
  head(10)
## # A tibble: 10 × 2
##    product_name                                                     total_profit
##    <chr>                                                                   <dbl>
##  1 Cubify CubeX 3D Printer Double Head Print                              -8880.
##  2 Lexmark MX611dhe Monochrome Laser Printer                              -4590.
##  3 Cubify CubeX 3D Printer Triple Head Print                              -3840.
##  4 Chromcraft Bull-Nose Wood Oval Conference Tables & Bases               -2876.
##  5 Bush Advantage Collection Racetrack Conference Table                   -1934.
##  6 GBC DocuBind P400 Electric Binding System                              -1878.
##  7 Cisco TelePresence System EX90 Videoconferencing Unit                  -1811.
##  8 Martin Yale Chadless Opener Electric Letter Opener                     -1299.
##  9 Balt Solid Wood Round Tables                                           -1201.
## 10 BoxOffice By Design Rectangular and Half-Moon Meeting Room Tabl…       -1148.

8. Recommendations

9. Limitations

10. Conclusion

This analysis reveals clear opportunities to optimize pricing, inventory, and regional strategy. Tableau dashboards complement this notebook with interactive visuals.


End of R Markdown Notebook

SQL Queries – BigQuery

Total Sales and Profit

{sql total_sales_profit, connection=bigquery}
SELECT
  SUM(Sales) AS total_sales,
  SUM(Profit) AS total_profit
FROM `studied-flow-472916-a9.sample_superstore.superstore`;

Profit by Category

{sql profit_by_category, connection=bigquery}
SELECT
  Category,
  SUM(Sales) AS total_sales,
  SUM(Quantity) AS total_quantity,
  SUM(Profit) AS total_profit
FROM `studied-flow-472916-a9.sample_superstore.superstore`
GROUP BY Category
ORDER BY total_profit DESC;

Discount Impact on Profit

{sql discount_profit, connection=bigquery}
SELECT
  Discount,
  AVG(Profit) AS avg_profit,
  AVG(Discount) AS avg_discount
FROM `studied-flow-472916-a9.sample_superstore.superstore`
GROUP BY Discount
ORDER BY Discount;

Top 10 Most Profitable Products

{sql top_products, connection=bigquery}
SELECT
  ProductName,
  SUM(Profit) AS total_profit
FROM `studied-flow-472916-a9.sample_superstore.superstore`
GROUP BY ProductName
ORDER BY total_profit DESC
LIMIT 10;

Bottom 10 Loss-Making Products

{sql bottom_products, connection=bigquery}
SELECT
  ProductName,
  SUM(Profit) AS total_profit
FROM `studied-flow-472916-a9.sample_superstore.superstore`
GROUP BY ProductName
ORDER BY total_profit ASC
LIMIT 10;

Regional Sales & Profit Summary

{sql regional_summary, connection=bigquery}
SELECT
  Region,
  SUM(Sales) AS sales,
  SUM(Profit) AS profit
FROM `studied-flow-472916-a9.sample_superstore.superstore`
GROUP BY Region
ORDER BY sales DESC;