This R Markdown notebook presents a full analysis of the Superstore dataset, focusing on sales, profit, discounts, customer behavior, and regional performance.
Tools used: Google Sheets, BigQuery (SQL), R, Tableau.
install.packages("bigrquery")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(readr)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(bigrquery)
superstore <- bq_table_download(
bq_table("studied-flow-472916-a9", "sample_superstore", "superstore")
)
## ! Using an auto-discovered, cached token.
## To suppress this message, modify your code or options to clearly consent to
## the use of a cached token.
## See gargle's "Non-interactive auth" vignette for more details:
## <https://gargle.r-lib.org/articles/non-interactive-auth.html>
## ℹ The bigrquery package is using a cached token for
## 'israeloyetoro11@gmail.com'.
## Auto-refreshing stale OAuth token.
superstore <- read_csv("superstore.csv") %>% clean_names()
## Rows: 9994 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): Order ID, Order Date, Ship Date, Ship Mode, Customer ID, Customer ...
## dbl (5): Row ID, Sales, Quantity, Discount, Profit
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(superstore)
## # A tibble: 6 × 21
## row_id order_id order_date ship_date ship_mode customer_id customer_name
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 CA-2016-152156 11/8/2016 11/11/20… Second C… CG-12520 Claire Gute
## 2 2 CA-2016-152156 11/8/2016 11/11/20… Second C… CG-12520 Claire Gute
## 3 3 CA-2016-138688 6/12/2016 6/16/2016 Second C… DV-13045 Darrin Van H…
## 4 4 US-2015-108966 10/11/2015 10/18/20… Standard… SO-20335 Sean O'Donne…
## 5 5 US-2015-108966 10/11/2015 10/18/20… Standard… SO-20335 Sean O'Donne…
## 6 6 CA-2014-115812 6/9/2014 6/14/2014 Standard… BH-11710 Brosina Hoff…
## # ℹ 14 more variables: segment <chr>, country <chr>, city <chr>, state <chr>,
## # postal_code <chr>, region <chr>, product_id <chr>, category <chr>,
## # sub_category <chr>, product_name <chr>, sales <dbl>, quantity <dbl>,
## # discount <dbl>, profit <dbl>
superstore <- superstore %>%
mutate(order_date = mdy(order_date),
ship_date = mdy(ship_date),
year = year(order_date),
month = month(order_date, label = TRUE),
profit_ratio = profit / sales)
superstore %>% summarise(
total_sales = sum(sales),
total_profit = sum(profit),
avg_discount = mean(discount)
)
## # A tibble: 1 × 3
## total_sales total_profit avg_discount
## <dbl> <dbl> <dbl>
## 1 2297201. 286397. 0.156
superstore %>%
group_by(category) %>%
summarise(sales = sum(sales), profit = sum(profit)) %>%
arrange(desc(profit))
## # A tibble: 3 × 3
## category sales profit
## <chr> <dbl> <dbl>
## 1 Technology 836154. 145455.
## 2 Office Supplies 719047. 122491.
## 3 Furniture 742000. 18451.
superstore %>%
group_by(region) %>%
summarise(sales = sum(sales), profit = sum(profit)) %>%
arrange(desc(profit))
## # A tibble: 4 × 3
## region sales profit
## <chr> <dbl> <dbl>
## 1 West 725458. 108418.
## 2 East 678781. 91523.
## 3 South 391722. 46749.
## 4 Central 501240. 39706.
superstore %>%
group_by(year, month) %>%
summarise(sales = sum(sales)) %>%
ggplot(aes(x = interaction(year, month), y = sales, group = year)) +
geom_line() +
theme(axis.text.x = element_text(angle = 90)) +
labs(title = "Monthly Sales Trend", x = "Year-Month", y = "Sales")
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
ggplot(superstore, aes(x = discount, y = profit)) +
geom_point(alpha = 0.3) +
geom_smooth() +
labs(title = "Discount Impact on Profit")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
superstore %>%
group_by(product_name) %>%
summarise(total_profit = sum(profit)) %>%
arrange(desc(total_profit)) %>%
head(10)
## # A tibble: 10 × 2
## product_name total_profit
## <chr> <dbl>
## 1 "Canon imageCLASS 2200 Advanced Copier" 25200.
## 2 "Fellowes PB500 Electric Punch Plastic Comb Binding Machine wit… 7753.
## 3 "Hewlett Packard LaserJet 3310 Copier" 6984.
## 4 "Canon PC1060 Personal Laser Copier" 4571.
## 5 "HP Designjet T520 Inkjet Large Format Printer - 24\" Color" 4095.
## 6 "Ativa V4110MDD Micro-Cut Shredder" 3773.
## 7 "3D Systems Cube Printer, 2nd Generation, Magenta" 3718.
## 8 "Plantronics Savi W720 Multi-Device Wireless Headset System" 3696.
## 9 "Ibico EPK-21 Electric Binding System" 3345.
## 10 "Zebra ZM400 Thermal Label Printer" 3344.
superstore %>%
group_by(product_name) %>%
summarise(total_profit = sum(profit)) %>%
arrange(total_profit) %>%
head(10)
## # A tibble: 10 × 2
## product_name total_profit
## <chr> <dbl>
## 1 Cubify CubeX 3D Printer Double Head Print -8880.
## 2 Lexmark MX611dhe Monochrome Laser Printer -4590.
## 3 Cubify CubeX 3D Printer Triple Head Print -3840.
## 4 Chromcraft Bull-Nose Wood Oval Conference Tables & Bases -2876.
## 5 Bush Advantage Collection Racetrack Conference Table -1934.
## 6 GBC DocuBind P400 Electric Binding System -1878.
## 7 Cisco TelePresence System EX90 Videoconferencing Unit -1811.
## 8 Martin Yale Chadless Opener Electric Letter Opener -1299.
## 9 Balt Solid Wood Round Tables -1201.
## 10 BoxOffice By Design Rectangular and Half-Moon Meeting Room Tabl… -1148.
This analysis reveals clear opportunities to optimize pricing, inventory, and regional strategy. Tableau dashboards complement this notebook with interactive visuals.
End of R Markdown Notebook
{sql total_sales_profit, connection=bigquery}
SELECT
SUM(Sales) AS total_sales,
SUM(Profit) AS total_profit
FROM `studied-flow-472916-a9.sample_superstore.superstore`;
{sql profit_by_category, connection=bigquery}
SELECT
Category,
SUM(Sales) AS total_sales,
SUM(Quantity) AS total_quantity,
SUM(Profit) AS total_profit
FROM `studied-flow-472916-a9.sample_superstore.superstore`
GROUP BY Category
ORDER BY total_profit DESC;
{sql discount_profit, connection=bigquery}
SELECT
Discount,
AVG(Profit) AS avg_profit,
AVG(Discount) AS avg_discount
FROM `studied-flow-472916-a9.sample_superstore.superstore`
GROUP BY Discount
ORDER BY Discount;
{sql top_products, connection=bigquery}
SELECT
ProductName,
SUM(Profit) AS total_profit
FROM `studied-flow-472916-a9.sample_superstore.superstore`
GROUP BY ProductName
ORDER BY total_profit DESC
LIMIT 10;
{sql bottom_products, connection=bigquery}
SELECT
ProductName,
SUM(Profit) AS total_profit
FROM `studied-flow-472916-a9.sample_superstore.superstore`
GROUP BY ProductName
ORDER BY total_profit ASC
LIMIT 10;
{sql regional_summary, connection=bigquery}
SELECT
Region,
SUM(Sales) AS sales,
SUM(Profit) AS profit
FROM `studied-flow-472916-a9.sample_superstore.superstore`
GROUP BY Region
ORDER BY sales DESC;