ETW2001 - Assignment 1

Author

Chew Jian Hong

Setup

library(tidyverse)
Warning: package 'tidyverse' was built under R version 4.5.3
Warning: package 'ggplot2' was built under R version 4.5.3
Warning: package 'tibble' was built under R version 4.5.3
Warning: package 'tidyr' was built under R version 4.5.3
Warning: package 'readr' was built under R version 4.5.3
Warning: package 'purrr' was built under R version 4.5.3
Warning: package 'dplyr' was built under R version 4.5.3
Warning: package 'stringr' was built under R version 4.5.3
Warning: package 'forcats' was built under R version 4.5.3
Warning: package 'lubridate' was built under R version 4.5.3
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.0     ✔ readr     2.2.0
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.2     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(dplyr)
library(readxl)
Warning: package 'readxl' was built under R version 4.5.3
# Load dataset (replace with correct path)
superstore <- read_excel("superstore (1).xlsx")

Question 1

# Filter countries with >200 records
# Keep rows with shipping cost above country average
# Create profit/shipping ratio and rank top 15

superstore %>%
  group_by(Country) %>%
  filter(n() > 200) %>%
  mutate(avg_ship = mean(Shipping.Cost, na.rm=TRUE)) %>%
  filter(Shipping.Cost > avg_ship) %>%
  mutate(profit_per_ship = Profit / Shipping.Cost) %>%
  arrange(desc(profit_per_ship)) %>%
  select(Customer.Name, Product.Name, Country, Sales, Profit, Shipping.Cost, profit_per_ship) %>%
  head(15)
# A tibble: 15 × 7
# Groups:   Country [9]
   Customer.Name Product.Name Country Sales Profit Shipping.Cost profit_per_ship
   <chr>         <chr>        <chr>   <dbl>  <dbl>         <dbl>           <dbl>
 1 Tom Ashbrook  Canon image… United… 11200  3920.          46.0            85.3
 2 Pauline John… Barricks Tr… Ukraine  4487  1436.          50.4            28.5
 3 Christopher … Canon PC106… United…  2800   945.          35.0            27.0
 4 Tamara Chand  Canon image… United… 17500  8400.         349.             24.1
 5 Justin Hirsh  KitchenAid … Iraq     3172  1237.          53.6            23.1
 6 Maribeth Sch… Samsung Sma… China    2545  1069.          52.3            20.4
 7 Michael Moore Cisco Smart… Germany  2617  1151.          57.0            20.2
 8 Kelly Collis… Logitech P7… United…  3347   636.          32.2            19.7
 9 Ben Peterman  Dania Class… Austra…  2596   923.          47.8            19.3
10 Brad Thomas   HP Fax Mach… Saudi …  1800   720.          38.5            18.7
11 Shirley Dani… Fellowes PB… United…  3813  1906.         103.             18.6
12 Katherine No… HP Fax and … United…  1553   776.          42.6            18.2
13 Robert Marley Office Star… Colomb…   930   456.          26.3            17.3
14 Adam Bellava… GBC DocuBin… United…  4355  1415.          82.7            17.1
15 Keith Herrera Sharp AL-15… United…  1200   435.          26.0            16.7

Question 2

# Market-year average profit per order (exclude zero/negative sales)
# Keep groups with >=50 orders
# Identify markets never below dataset median profit
# Show top 10 by average sales

median_profit <- median(superstore$Profit, na.rm=TRUE)

superstore %>%
  filter(Sales > 0) %>%
  group_by(Market, Year) %>%
  filter(n() >= 50) %>%
  summarise(avg_profit = mean(Profit), avg_sales = mean(Sales), .groups="drop") %>%
  group_by(Market) %>%
  filter(all(avg_profit >= median_profit)) %>%
  arrange(desc(avg_sales)) %>%
  head(10)
# A tibble: 10 × 4
# Groups:   Market [3]
   Market  Year avg_profit avg_sales
   <chr>  <dbl>      <dbl>     <dbl>
 1 APAC    2013       42.6      337.
 2 APAC    2011       42.8      329.
 3 APAC    2012       37.6      321.
 4 APAC    2014       37.0      319.
 5 EU      2013       38.4      297.
 6 EU      2014       36.5      295.
 7 EU      2012       37.2      291.
 8 EU      2011       37.2      289.
 9 US      2011       24.9      243.
10 US      2013       31.7      236.

Question 3

# Typical discount per sub-category
# Flag suspiciously high discounts
# Label as loss-making vs non-loss-making
# Count by market & segment, keep loss share >40%

superstore %>%
  group_by(Sub.Category) %>%
  mutate(median_disc = median(Discount, na.rm=TRUE)) %>%
  filter(Discount > median_disc) %>%
  mutate(label = ifelse(Profit < 0, "loss-making", "non-loss-making")) %>%
  group_by(Market, Segment, label) %>%
  summarise(count = n(), .groups="drop") %>%
  group_by(Market, Segment) %>%
  mutate(loss_share = count[label=="loss-making"] / sum(count)) %>%
  filter(loss_share > 0.4) %>%
  arrange(desc(loss_share))
# A tibble: 30 × 5
# Groups:   Market, Segment [15]
   Market Segment     label           count loss_share
   <chr>  <chr>       <chr>           <int>      <dbl>
 1 EMEA   Consumer    loss-making       810      0.999
 2 EMEA   Consumer    non-loss-making     1      0.999
 3 EMEA   Corporate   loss-making       503      0.998
 4 EMEA   Corporate   non-loss-making     1      0.998
 5 EMEA   Home Office loss-making       293      0.997
 6 EMEA   Home Office non-loss-making     1      0.997
 7 Africa Home Office loss-making       191      0.979
 8 Africa Home Office non-loss-making     4      0.979
 9 Africa Corporate   loss-making       307      0.975
10 Africa Corporate   non-loss-making     8      0.975
# ℹ 20 more rows

Question 4

# Running total of profit, skip quantity <2
# Stop when >25,000
# Compare original order vs sorted by sales

running_total <- 0
stop_row <- NA

for(i in 1:nrow(superstore)){
  if(superstore$Quantity[i] >= 2){
    running_total <- running_total + superstore$Profit[i]
    if(running_total > 25000){
      stop_row <- i
      break
    }
  }
}

list(row=stop_row,
     order_id=superstore$Order.ID[stop_row],
     customer=superstore$Customer.Name[stop_row],
     total=running_total)
$row
[1] 846

$order_id
[1] "CA-2014-120936"

$customer
[1] "Christine Abelman"

$total
[1] 25091.81
# Repeat after sorting by sales

Question 5

# Products sold in >=4 countries
# Compare avg profit above vs below median discount
# Gap column, keep products where high-discount performs better

superstore %>%
  group_by(Product.Name) %>%
  filter(n_distinct(Country) >= 4) %>%
  mutate(median_disc = median(Discount, na.rm=TRUE)) %>%
  summarise(
    high_disc_profit = mean(Profit[Discount > median_disc], na.rm=TRUE),
    low_disc_profit = mean(Profit[Discount <= median_disc], na.rm=TRUE),
    gap = high_disc_profit - low_disc_profit,
    .groups="drop"
  ) %>%
  filter(gap > 0) %>%
  arrange(desc(gap)) %>%
  head(5)
# A tibble: 5 × 4
  Product.Name                      high_disc_profit low_disc_profit   gap
  <chr>                                        <dbl>           <dbl> <dbl>
1 KitchenAid Microwave, Black                   484.            79.5  405.
2 Cuisinart Refrigerator, White                 351.           148.   203.
3 Hoover Refrigerator, Black                    331.           142.   189.
4 Lesro Training Table, Rectangular            -428.          -575.   148.
5 KitchenAid Stove, Silver                      541.           413.   128.

Question 6

# City share of market sales
# Find dominant city per segment
# Keep cases >1/3 share

superstore %>%
  group_by(Market, City) %>%
  summarise(city_sales = sum(Sales), .groups="drop") %>%
  group_by(Market) %>%
  mutate(market_sales = sum(city_sales),
         share = city_sales / market_sales) %>%
  slice_max(order_by = share, n = 1) %>%
  filter(share > 1/3)
# A tibble: 0 × 5
# Groups:   Market [0]
# ℹ 5 variables: Market <chr>, City <chr>, city_sales <dbl>,
#   market_sales <dbl>, share <dbl>

Question 7

# Custom score penalizing discount + shipping
# Apply only to Q4 orders
# Top product per region-year, find most frequent winner

superstore %>%
  filter(month(Order.Date) %in% 10:12) %>%
  mutate(score = Profit - (Discount*Sales) - Shipping.Cost) %>%
  group_by(Region, Year, Product.Name) %>%
  summarise(avg_score = mean(score), .groups="drop") %>%
  group_by(Region, Year) %>%
  slice_max(order_by=avg_score, n=1) %>%
  group_by(Region, Product.Name) %>%
  summarise(win_count = n(), .groups="drop") %>%
  slice_max(order_by=win_count, n=1)
# A tibble: 1 × 3
  Region    Product.Name                        win_count
  <chr>     <chr>                                   <int>
1 Caribbean Dania Classic Bookcase, Traditional         2

Question 8

# Compare ship date gaps by priority
# Find cases where lower urgency ships faster

superstore %>%
  mutate(days_gap = as.numeric(difftime(Ship.Date, Order.Date, units="days"))) %>%
  filter(!is.na(days_gap), days_gap >= 0) %>%
  group_by(Ship.Mode, Order.Priority) %>%
  summarise(avg_gap = mean(days_gap), .groups="drop") %>%
  group_by(Ship.Mode) %>%
  arrange(avg_gap) %>%
  mutate(surprising = avg_gap < lag(avg_gap)) %>%
  filter(surprising==TRUE) %>%
  arrange(desc(avg_gap))
# A tibble: 0 × 4
# Groups:   Ship.Mode [0]
# ℹ 4 variables: Ship.Mode <chr>, Order.Priority <chr>, avg_gap <dbl>,
#   surprising <lgl>

Question 9

# Week-by-week cumulative sales per year
# Skip weeks with negative profit
# Stop when >20% of valid yearly sales

superstore %>%
  mutate(Week.Number = week(Order.Date),
         Year = year(Order.Date)) %>%
  group_by(Year, Week.Number) %>%
  summarise(week_sales = sum(Sales),
            week_profit = sum(Profit),
            .groups="drop") %>%
  group_by(Year) %>%
  mutate(valid_sales = sum(week_sales[week_profit >= 0]),
         cum_sales = cumsum(ifelse(week_profit >= 0, week_sales, 0)),
         stop_flag = cum_sales > 0.2 * valid_sales) %>%
  filter(stop_flag) %>%
  slice_min(order_by = Week.Number, n = 1)
# A tibble: 4 × 7
# Groups:   Year [4]
   Year Week.Number week_sales week_profit valid_sales cum_sales stop_flag
  <dbl>       <dbl>      <dbl>       <dbl>       <dbl>     <dbl> <lgl>    
1  2011          17      23470       2518.     2236113    451214 TRUE     
2  2012          18      30022       3664.     2643956    550782 TRUE     
3  2013          16      42397       7398.     3405860    694211 TRUE     
4  2014          16      59337       2950.     4300041    873748 TRUE     

Question 10

# State-subcategory combos with >=30 rows
# Profitability metric vs dataset average
# Shipping per unit metric vs dataset average
# Rank underperformers per region

avg_profit_ratio <- mean(superstore$Profit/superstore$Sales, na.rm=TRUE)
avg_ship_unit <- mean(superstore$Shipping.Cost/superstore$Quantity, na.rm=TRUE)

superstore %>%
  group_by(State, Sub.Category, Region) %>%
  filter(n() >= 30) %>%
  summarise(
    profit_ratio = mean(Profit/Sales, na.rm=TRUE),
    ship_unit = mean(Shipping.Cost/Quantity, na.rm=TRUE),
    .groups="drop"
  ) %>%
  filter(profit_ratio < avg_profit_ratio, ship_unit > avg_ship_unit) %>%
  group_by(Region) %>%
  arrange(profit_ratio) %>%
  slice_head(n=3)
# A tibble: 0 × 5
# Groups:   Region [0]
# ℹ 5 variables: State <chr>, Sub.Category <chr>, Region <chr>,
#   profit_ratio <dbl>, ship_unit <dbl>