OnlineRetail

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(readxl)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(ggplot2)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.1.0     ✔ tidyr     1.3.1
## ✔ readr     2.1.5

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ MASS::select()  masks dplyr::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(corrplot)

## corrplot 0.95 loaded

library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:dplyr':
## 
##     recode

library(GGally)
library(ROCR)
library(data.table)

## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

library(Hmisc)

## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(tibble)
library(knitr)
library(here)

## here() starts at C:/Users/aliso/Documents/UTSA/Data Analytics Applications/OnlineRetail

library(pROC)

## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(rpart)
library(rpart.plot)
library(randomForest)

## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

library(broom)
library(tidyr)

# Read in 2009-2010

year1 <- read_excel("online_retail.xlsx", sheet = "Year 2009-2010")
 
# Read in 2010-2011

year2 <- read_excel("online_retail.xlsx", sheet = "Year 2010-2011")

glimpse(year1)

## Rows: 525,461
## Columns: 8
## $ Invoice       <chr> "489434", "489434", "489434", "489434", "489434", "48943…
## $ StockCode     <chr> "85048", "79323P", "79323W", "22041", "21232", "22064", …
## $ Description   <chr> "15CM CHRISTMAS GLASS BALL 20 LIGHTS", "PINK CHERRY LIGH…
## $ Quantity      <dbl> 12, 12, 12, 48, 24, 24, 24, 10, 12, 12, 24, 12, 10, 18, …
## $ InvoiceDate   <dttm> 2009-12-01 07:45:00, 2009-12-01 07:45:00, 2009-12-01 07…
## $ Price         <dbl> 6.95, 6.75, 6.75, 2.10, 1.25, 1.65, 1.25, 5.95, 2.55, 3.…
## $ `Customer ID` <dbl> 13085, 13085, 13085, 13085, 13085, 13085, 13085, 13085, …
## $ Country       <chr> "United Kingdom", "United Kingdom", "United Kingdom", "U…

glimpse(year2)

## Rows: 541,910
## Columns: 8
## $ Invoice       <chr> "536365", "536365", "536365", "536365", "536365", "53636…
## $ StockCode     <chr> "85123A", "71053", "84406B", "84029G", "84029E", "22752"…
## $ Description   <chr> "WHITE HANGING HEART T-LIGHT HOLDER", "WHITE METAL LANTE…
## $ Quantity      <dbl> 6, 6, 8, 6, 6, 2, 6, 6, 6, 6, 3, 3, 3, 32, 6, 6, 8, 6, 6…
## $ InvoiceDate   <dttm> 2010-12-01 08:26:00, 2010-12-01 08:26:00, 2010-12-01 08…
## $ Price         <dbl> 2.55, 3.39, 2.75, 3.39, 3.39, 7.65, 4.25, 1.85, 1.85, 4.…
## $ `Customer ID` <dbl> 17850, 17850, 17850, 17850, 17850, 17850, 17850, 17850, …
## $ Country       <chr> "United Kingdom", "United Kingdom", "United Kingdom", "U…

anyNA(year1)

## [1] TRUE

anyNA(year2)

## [1] TRUE

na_count_yr1 <- sum(is.na(year1))
print(na_count_yr1)

## [1] 110855

na_count_yr2 <- sum(is.na(year2))
print(na_count_yr2)

## [1] 136534

no_NA_year1 <- na.omit(year1)

no_NA_year2 <- na.omit(year2)

anyNA(no_NA_year1)

## [1] FALSE

anyNA(no_NA_year2)

## [1] FALSE

summary(no_NA_year1)

##    Invoice           StockCode         Description           Quantity       
##  Length:417534      Length:417534      Length:417534      Min.   :-9360.00  
##  Class :character   Class :character   Class :character   1st Qu.:    2.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    4.00  
##                                                           Mean   :   12.76  
##                                                           3rd Qu.:   12.00  
##                                                           Max.   :19152.00  
##   InvoiceDate                      Price            Customer ID   
##  Min.   :2009-12-01 07:45:00   Min.   :    0.000   Min.   :12346  
##  1st Qu.:2010-03-26 11:26:00   1st Qu.:    1.250   1st Qu.:13983  
##  Median :2010-07-08 19:12:00   Median :    1.950   Median :15311  
##  Mean   :2010-07-01 01:12:19   Mean   :    3.888   Mean   :15361  
##  3rd Qu.:2010-10-14 14:08:00   3rd Qu.:    3.750   3rd Qu.:16799  
##  Max.   :2010-12-09 20:01:00   Max.   :25111.090   Max.   :18287  
##    Country         
##  Length:417534     
##  Class :character  
##  Mode  :character  
##                    
##                    
##

summary(no_NA_year2)

##    Invoice           StockCode         Description           Quantity        
##  Length:406830      Length:406830      Length:406830      Min.   :-80995.00  
##  Class :character   Class :character   Class :character   1st Qu.:     2.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :     5.00  
##                                                           Mean   :    12.06  
##                                                           3rd Qu.:    12.00  
##                                                           Max.   : 80995.00  
##   InvoiceDate                      Price            Customer ID   
##  Min.   :2010-12-01 08:26:00   Min.   :    0.000   Min.   :12346  
##  1st Qu.:2011-04-06 15:02:00   1st Qu.:    1.250   1st Qu.:13953  
##  Median :2011-07-31 11:48:00   Median :    1.950   Median :15152  
##  Mean   :2011-07-10 16:31:30   Mean   :    3.461   Mean   :15288  
##  3rd Qu.:2011-10-20 13:06:00   3rd Qu.:    3.750   3rd Qu.:16791  
##  Max.   :2011-12-09 12:50:00   Max.   :38970.000   Max.   :18287  
##    Country         
##  Length:406830     
##  Class :character  
##  Mode  :character  
##                    
##                    
##

no_NA_df <- bind_rows(no_NA_year1, no_NA_year2)

# Rename column in position 7 to 'CustomerID'
colnames(no_NA_df)[7] <- 'CustomerID'

head(no_NA_df)

## # A tibble: 6 × 8
##   Invoice StockCode Description    Quantity InvoiceDate         Price CustomerID
##   <chr>   <chr>     <chr>             <dbl> <dttm>              <dbl>      <dbl>
## 1 489434  85048     "15CM CHRISTM…       12 2009-12-01 07:45:00  6.95      13085
## 2 489434  79323P    "PINK CHERRY …       12 2009-12-01 07:45:00  6.75      13085
## 3 489434  79323W    "WHITE CHERRY…       12 2009-12-01 07:45:00  6.75      13085
## 4 489434  22041     "RECORD FRAME…       48 2009-12-01 07:45:00  2.1       13085
## 5 489434  21232     "STRAWBERRY C…       24 2009-12-01 07:45:00  1.25      13085
## 6 489434  22064     "PINK DOUGHNU…       24 2009-12-01 07:45:00  1.65      13085
## # ℹ 1 more variable: Country <chr>

library(dplyr)
library(lubridate)
library(ggplot2)

#Top selling products for 2009-2011

no_NA_revenue<-no_NA_df |> group_by(StockCode, Description)|>summarise(sales=sum(Quantity*Price))|>ungroup()|> arrange(desc(sales))

## `summarise()` has grouped output by 'StockCode'. You can override using the
## `.groups` argument.

no_NA_revenue

## # A tibble: 5,302 × 3
##    StockCode Description                          sales
##    <chr>     <chr>                                <dbl>
##  1 22423     REGENCY CAKESTAND 3 TIER           269737.
##  2 85123A    WHITE HANGING HEART T-LIGHT HOLDER 242701.
##  3 85099B    JUMBO BAG RED RETROSPOT            134845.
##  4 84879     ASSORTED COLOUR BIRD ORNAMENT      126354.
##  5 POST      POSTAGE                            112249.
##  6 47566     PARTY BUNTING                      102686.
##  7 22086     PAPER CHAIN KIT 50'S CHRISTMAS      78367.
##  8 79321     CHILLI LIGHTS                       72229.
##  9 21137     BLACK RECORD COVER FRAME            67127.
## 10 85099F    JUMBO BAG STRAWBERRY                64089.
## # ℹ 5,292 more rows

#Look for time series trends

pacman::p_load("pacman","tidyverse", "openxlsx", "forecast", "modeltime", "parsnip", "rsample", "timetk", "xts", "tidyquant", "feasts", "prophet")

no_NA_dlyrevenue<-no_NA_df |> group_by(InvoiceDate)|>summarise(dlysales=sum(Quantity*Price))|>ungroup()|>arrange(desc(dlysales)) 

no_NA_dlyrevenue

## # A tibble: 41,439 × 2
##    InvoiceDate         dlysales
##    <dttm>                 <dbl>
##  1 2011-12-09 09:15:00  168470.
##  2 2011-01-18 10:01:00   77184.
##  3 2010-01-07 12:34:00   44052.
##  4 2011-06-10 15:28:00   39597.
##  5 2010-09-27 16:59:00   33168.
##  6 2010-12-07 16:43:00   32562.
##  7 2011-09-20 11:05:00   31698.
##  8 2010-10-14 09:46:00   26007.
##  9 2011-06-15 13:37:00   23427.
## 10 2010-07-15 15:29:00   22863.
## # ℹ 41,429 more rows

no_NA_top10revenue<-no_NA_df |>  filter(StockCode != "POST") |>  group_by(StockCode, Description)|>summarise(sales=sum(Quantity*Price))|>ungroup()|> arrange(desc(sales))|> slice_head(n=10)

## `summarise()` has grouped output by 'StockCode'. You can override using the
## `.groups` argument.

no_NA_top10revenue

## # A tibble: 10 × 3
##    StockCode Description                          sales
##    <chr>     <chr>                                <dbl>
##  1 22423     REGENCY CAKESTAND 3 TIER           269737.
##  2 85123A    WHITE HANGING HEART T-LIGHT HOLDER 242701.
##  3 85099B    JUMBO BAG RED RETROSPOT            134845.
##  4 84879     ASSORTED COLOUR BIRD ORNAMENT      126354.
##  5 47566     PARTY BUNTING                      102686.
##  6 22086     PAPER CHAIN KIT 50'S CHRISTMAS      78367.
##  7 79321     CHILLI LIGHTS                       72229.
##  8 21137     BLACK RECORD COVER FRAME            67127.
##  9 85099F    JUMBO BAG STRAWBERRY                64089.
## 10 82484     WOOD BLACK BOARD ANT WHITE FINISH   60188.

ggplot(no_NA_top10revenue, aes(x = reorder(Description, sales), y = sales)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +  # To flip the bars horizontally for better readability
  labs(title = "Top Selling Products (2009-2011)", 
       x = "Product", 
       y = "Total Sales (in $10k)") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 8))

ggplot(data=no_NA_dlyrevenue, aes(x=InvoiceDate, y=dlysales)) + geom_line() + geom_ma(ma_fun = SMA, color = "orange", n = 4) + geom_ma(ma_fun = SMA, n=5, color = "lightblue")+ theme_classic() + theme(axis.text.x = element_text(angle = 45, hjust = 1))

no_NA_df$sales <- no_NA_df$Quantity*no_NA_df$Price

head(no_NA_df)

## # A tibble: 6 × 9
##   Invoice StockCode Description    Quantity InvoiceDate         Price CustomerID
##   <chr>   <chr>     <chr>             <dbl> <dttm>              <dbl>      <dbl>
## 1 489434  85048     "15CM CHRISTM…       12 2009-12-01 07:45:00  6.95      13085
## 2 489434  79323P    "PINK CHERRY …       12 2009-12-01 07:45:00  6.75      13085
## 3 489434  79323W    "WHITE CHERRY…       12 2009-12-01 07:45:00  6.75      13085
## 4 489434  22041     "RECORD FRAME…       48 2009-12-01 07:45:00  2.1       13085
## 5 489434  21232     "STRAWBERRY C…       24 2009-12-01 07:45:00  1.25      13085
## 6 489434  22064     "PINK DOUGHNU…       24 2009-12-01 07:45:00  1.65      13085
## # ℹ 2 more variables: Country <chr>, sales <dbl>

ggplot(data=no_NA_df, aes(x=InvoiceDate, y=sales)) + geom_line() + geom_ma(ma_fun = SMA, color = "orange", n = 4) + geom_ma(ma_fun = SMA, n=5, color = "lightblue")+ theme_classic() + theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(dplyr)

pacman::p_load(pacman,
  tidyverse, openxlsx, forecast, modeltime, parsnip, rsample, timetk, xts, tidyquant, feasts, prophet)

d3 <- no_NA_df %>%
  mutate(Date = as.Date(InvoiceDate, format = "%Y-%m-%d")) %>%
  group_by(Date) %>%
  summarise(Daily_Sales = sum(sales))


head(d3)

## # A tibble: 6 × 2
##   Date       Daily_Sales
##   <date>           <dbl>
## 1 2009-12-01      42708.
## 2 2009-12-02      52578.
## 3 2009-12-03      61534.
## 4 2009-12-04      33687.
## 5 2009-12-05       9803.
## 6 2009-12-06      24284.

ggplot(d3, aes(x = Date, y = Daily_Sales)) +
  geom_line(color = "blue") +
  labs(title = "Daily Sales Trend", x = "Date", y = "Sales") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

sales_ts <- ts(d3$Daily_Sales, start = c(year(min(d3$Date)), month(min(d3$Date))), frequency = 365)

# Fit ARIMA model
arima_model <- auto.arima(sales_ts)

# Forecast future demand
forecast_arima <- forecast(arima_model, h = 30)  # Sales forecast for the next 30 days

# Plot the forecast
autoplot(forecast_arima) +
  labs(title = "ARIMA Forecast", x = "Date", y = "Sales")

# Fit ETS model
ets_model <- ets(sales_ts)

## Warning in ets(sales_ts): I can't handle data with frequency greater than 24.
## Seasonality will be ignored. Try stlf() if you need seasonal forecasts.

# Forecast future demand
forecast_ets <- forecast(ets_model, h = 30)

# Plot the forecast
autoplot(forecast_ets) +
  labs(title = "ETS Forecast", x = "Date", y = "Sales")

suppressPackageStartupMessages({
  library(dplyr)
  library(lubridate)
  library(tidyr)
  library(purrr)
  library(forecast)
  library(ggplot2)
  library(readr)
})

# ---------------- USER INPUTS ----------------
exclude_codes <- c("POST")   # e.g., c("22423")
horizon_days  <- 60    # number of future days to forecast
lookback_days_for_top <- 0  # 0 = all history
# --------------------------------------------

# Ensure required columns exist: InvoiceDate, StockCode, Quantity, Price
stopifnot(all(c("InvoiceDate","StockCode","Quantity","Price") %in% names(no_NA_df)))

# 1) Standardize types
df <- no_NA_df %>%
  dplyr::mutate(
    InvoiceDate = as.Date(InvoiceDate),
    StockCode   = as.character(StockCode),
    Quantity    = as.numeric(Quantity),
    Price       = as.numeric(Price)
  ) %>%
  dplyr::filter(!is.na(InvoiceDate), !is.na(StockCode), !is.na(Quantity), !is.na(Price))

# If Description missing, create it (keeps code robust)
if (!"Description" %in% names(df)) {
  df <- df %>% dplyr::mutate(Description = NA_character_)
} else {
  df <- df %>% dplyr::mutate(Description = as.character(Description))
}

# 2) Top-10 products by revenue (optional lookback)
df_top_scope <- if (lookback_days_for_top > 0) {
  cutoff <- max(df$InvoiceDate, na.rm = TRUE) - lookback_days_for_top
  df %>% dplyr::filter(InvoiceDate > cutoff)
} else df

top10_tbl <- df_top_scope %>%
  dplyr::filter(!StockCode %in% exclude_codes) %>%
  dplyr::group_by(StockCode, Description) %>%
  dplyr::summarise(sales = sum(Quantity * Price, na.rm = TRUE), .groups = "drop") %>%
  dplyr::arrange(dplyr::desc(sales)) %>%
  dplyr::slice_head(n = 10)

top10_ids <- top10_tbl$StockCode
message("Top 10 StockCodes: ", paste(top10_ids, collapse = ", "))

## Top 10 StockCodes: 22423, 85123A, 85099B, 84879, 47566, 22086, 79321, 21137, 85099F, 82484

# 3) Aggregate DAILY demand & fill missing days
daily_raw <- df %>%
  dplyr::filter(StockCode %in% top10_ids) %>%
  dplyr::group_by(StockCode, InvoiceDate) %>%
  dplyr::summarise(units = sum(Quantity), .groups = "drop")

daily <- daily_raw %>%
  dplyr::group_by(StockCode) %>%
  tidyr::complete(InvoiceDate = seq(min(InvoiceDate), max(InvoiceDate), by = "1 day"),
                  fill = list(units = 0)) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(StockCode, InvoiceDate)

# 4) Modeling helpers
safe_fit <- function(y) {
  fit <- tryCatch(
    forecast::auto.arima(y, seasonal = TRUE, stepwise = FALSE, approximation = FALSE),
    error = function(e) NULL
  )
  if (is.null(fit)) forecast::ets(y) else fit
}

to_ts <- function(df_prod) {
  stats::ts(df_prod$units, frequency = 7)  # daily with weekly seasonality
}

# 5) Fit + forecast per product
results <- top10_ids %>%
  purrr::map_dfr(function(pid) {
    d <- daily %>% dplyr::filter(StockCode == pid)

    if (sum(d$units > 0, na.rm = TRUE) < 3) {
      fc_vals <- rep(mean(d$units, na.rm = TRUE), horizon_days)
      tibble::tibble(
        StockCode = pid,
        day_ahead = 1:horizon_days,
        forecast = fc_vals,
        lo95 = NA_real_,
        hi95 = NA_real_
      )
    } else {
      y  <- to_ts(d)
      fit <- safe_fit(y)
      fc  <- forecast::forecast(fit, h = horizon_days)

      tibble::tibble(
        StockCode = pid,
        day_ahead = 1:horizon_days,
        forecast = as.numeric(fc$mean),
        lo95 = as.numeric(fc$lower[, "95%"]),
        hi95 = as.numeric(fc$upper[, "95%"])
      )
    }
  }) %>%
  dplyr::left_join(
    top10_tbl %>% dplyr::select(StockCode, Description),
    by = "StockCode"
  )

# 6) Save outputs
readr::write_csv(results, "top10_daily_forecasts2.csv")
message("Saved: top10_daily_forecasts2.csv")

## Saved: top10_daily_forecasts2.csv

# 7) Plot one product’s history + forecast
plot_one <- function(pid) {
  df_hist <- daily   %>% dplyr::filter(StockCode == pid)
  df_fc   <- results %>% dplyr::filter(StockCode == pid)

  title_txt <- paste0(pid, if (!all(is.na(df_fc$Description))) {
    paste0(" – ", unique(stats::na.omit(df_fc$Description))[1])
  } else "")

  last_hist <- max(df_hist$InvoiceDate, na.rm = TRUE)

 ggplot2::ggplot() +
    # historical demand in blue
    ggplot2::geom_line(data = df_hist, 
                       ggplot2::aes(x = InvoiceDate, y = units), 
                       color = "steelblue", linewidth = 1) +
    # forecast interval in light red
    ggplot2::geom_ribbon(data = df_fc,
                         ggplot2::aes(x = last_hist + day_ahead,
                                      ymin = lo95, ymax = hi95),
                         fill = "salmon", alpha = 0.3) +
    # forecast line in dark red
    ggplot2::geom_line(data = df_fc,
                       ggplot2::aes(x = last_hist + day_ahead, y = forecast),
                       color = "firebrick", linewidth = 1) +
    ggplot2::labs(title = paste("Daily Demand & Forecast:", title_txt),
                  x = "Date", y = "Units") +
    ggplot2::theme_minimal()
}

# Example:
# plot_one(top10_ids[1])

for (pid in top10_ids) {
  print(plot_one(pid))
}

suppressPackageStartupMessages({
  library(dplyr); library(lubridate); library(tidyr)
})

# ---- Choose how to define a "Period" ----
# Set fiscal_start_month = 1 for calendar years (Jan–Dec -> "YYYY-YYYY+1" labels still okay),
# or use e.g. 7 for fiscal (Jul–Jun), 4 for Apr–Mar, etc.
fiscal_start_month <- 7  # << change to 1 for calendar-year style periods

no_NA_df <- no_NA_df %>%
  dplyr::mutate(
    InvoiceDate = as.POSIXct(InvoiceDate),
    Date        = as.Date(InvoiceDate),
    StockCode   = as.character(StockCode),
    Description = as.character(Description),
    Quantity    = as.numeric(Quantity),
    Price       = as.numeric(Price),
    sales       = Quantity * Price
  ) %>%
  # Create Period dynamically from Date and fiscal start
  dplyr::mutate(
    Year  = lubridate::year(Date),
    Month = lubridate::month(Date),
    PeriodStartYear = dplyr::if_else(Month < fiscal_start_month, Year - 1L, Year),
    PeriodEndYear   = PeriodStartYear + 1L,
    Period = paste0(PeriodStartYear, "-", PeriodEndYear)
  )

# Sanity check (optional)
# print(table(no_NA_df$Period, useNA="ifany"))

### 1) Auto-select current & prior periods, compute KPIs (dedup safe)

suppressPackageStartupMessages({
  library(dplyr); library(lubridate); library(tidyr); library(purrr); library(scales)
})

# ---- Pick the latest two observed Periods dynamically ----
periods_found <- no_NA_df %>%
  dplyr::distinct(Period, PeriodStartYear, PeriodEndYear) %>%
  dplyr::arrange(dplyr::desc(PeriodStartYear), dplyr::desc(PeriodEndYear))

stopifnot(nrow(periods_found) >= 1)
current_period <- periods_found$Period[1]
prior_period   <- if (nrow(periods_found) >= 2) periods_found$Period[2] else NA_character_

# ---- Revenue / Units / Returns by Period + SKU + Description ----
rev_by_period <- no_NA_df %>%
  dplyr::group_by(Period, StockCode, Description) %>%
  dplyr::summarise(
    Revenue       = sum(sales, na.rm = TRUE),
    Units_shipped = sum(Quantity[Quantity > 0], na.rm = TRUE),
    Units_return  = sum(abs(Quantity[Quantity < 0]), na.rm = TRUE),
    .groups = "drop"
  )

# ---- Collapse to one row per SKU for each period (dedupe fix) ----
cur <- rev_by_period %>%
  dplyr::filter(Period == current_period) %>%
  dplyr::group_by(StockCode) %>%
  dplyr::summarise(
    # pick representative non-NA Description by highest Revenue
    Description = {
      tmp <- dplyr::arrange(pick(dplyr::everything()), dplyr::desc(Revenue))
      dplyr::first(stats::na.omit(tmp$Description))
    },
    cur_rev     = sum(Revenue, na.rm = TRUE),
    cur_units   = sum(Units_shipped, na.rm = TRUE),
    cur_ret     = sum(Units_return, na.rm = TRUE),
    .groups = "drop"
  )

prior <- rev_by_period %>%
  dplyr::filter(Period == prior_period) %>%
  dplyr::group_by(StockCode) %>%
  dplyr::summarise(
    prior_rev = sum(Revenue, na.rm = TRUE),
    .groups = "drop"
  )

# ---- YoY + ReturnRate ----
yoy_tbl <- cur %>%
  dplyr::left_join(prior, by = "StockCode") %>%
  dplyr::mutate(
    YoY = dplyr::case_when(
      is.na(prior_rev) ~ NA_real_,
      prior_rev == 0 & cur_rev > 0 ~ NA_real_,
      TRUE ~ (cur_rev - prior_rev) / prior_rev
    ),
    ReturnRate = dplyr::case_when(
      is.na(cur_units) | cur_units <= 0 ~ NA_real_,
      TRUE ~ cur_ret / cur_units
    )
  )

# ---- Build helpers for Momentum (current period) & 6M Trend ----
cur_df <- no_NA_df %>% dplyr::filter(Period == current_period)

last_date <- max(cur_df$Date, na.rm = TRUE)
last_complete_month <- lubridate::floor_date(last_date, "month") - lubridate::days(1)
end_3m   <- last_complete_month
start_3m <- lubridate::floor_date(end_3m %m-% months(2), "month")
end_prev <- start_3m - lubridate::days(1)
start_prev <- lubridate::floor_date(end_prev %m-% months(2), "month")

momentum_tbl <- cur_df %>%
  dplyr::filter(Date >= start_prev & Date <= end_3m) %>%
  dplyr::mutate(
    bucket = dplyr::case_when(
      Date >= start_3m & Date <= end_3m ~ "last3",
      Date >= start_prev & Date <= end_prev ~ "prev3",
      TRUE ~ NA_character_
    )
  ) %>%
  dplyr::filter(!is.na(bucket)) %>%
  dplyr::group_by(StockCode, bucket) %>%
  dplyr::summarise(rev = sum(sales, na.rm = TRUE), .groups = "drop") %>%
  tidyr::pivot_wider(names_from = bucket, values_from = rev, values_fill = 0) %>%
  dplyr::mutate(
    Momentum3M = dplyr::if_else(prev3 > 0, (last3 / prev3) - 1, NA_real_),
    MomentumLabel = dplyr::case_when(
      is.na(Momentum3M)  ~ "—",
      Momentum3M > 0.05  ~ "↑ Accelerating",
      Momentum3M < -0.05 ~ "↓ Softening",
      TRUE               ~ "→ Stable"
    )
  ) %>%
  dplyr::select(StockCode, Momentum3M, MomentumLabel)

six_end   <- last_complete_month
six_start <- lubridate::floor_date(six_end %m-% months(5), "month")

monthly_cur <- cur_df %>%
  dplyr::filter(Date >= six_start & Date <= six_end) %>%
  dplyr::mutate(ym = lubridate::floor_date(Date, "month")) %>%
  dplyr::group_by(StockCode, ym) %>%
  dplyr::summarise(rev = sum(sales, na.rm = TRUE), .groups = "drop")

# ---- 6-Month Trend (safe version) ----
trend_tbl <- monthly_cur %>%
  dplyr::group_by(StockCode) %>%
  dplyr::mutate(t = as.numeric(ym)) %>%
  tidyr::nest() %>%
  dplyr::mutate(
    model = purrr::map(data, ~ tryCatch(
      stats::lm(rev ~ t, data = .x),
      error = function(e) NULL
    )),
    slope = purrr::map_dbl(model, ~ {
      if (is.null(.x)) NA_real_ else stats::coef(.x)[["t"]]
    })
  ) %>%
  dplyr::select(StockCode, slope) %>%
  dplyr::ungroup()

# ---- Base for current period ----
base_tbl <- yoy_tbl %>%
  dplyr::left_join(momentum_tbl, by = "StockCode")

### 2) Limit to Top-10 most frequently purchased SKUs (current period) and render tables


suppressPackageStartupMessages({ library(kableExtra); library(scales) })

# ---- Top 10 by purchase frequency (units) in current period ----
top10_freq <- no_NA_df %>%
  dplyr::filter(Period == current_period, Quantity > 0) %>%
  dplyr::group_by(StockCode, Description) %>%
  dplyr::summarise(total_units = sum(Quantity, na.rm = TRUE), .groups = "drop") %>%
  dplyr::arrange(dplyr::desc(total_units)) %>%
  dplyr::slice_head(n = 10)
top10_ids <- top10_freq$StockCode

base_top  <- base_tbl  %>% dplyr::filter(StockCode %in% top10_ids)
trend_top <- trend_tbl %>% dplyr::filter(StockCode %in% top10_ids)

# ---- Table 6: Product Leaderboard (exact columns) ----
tbl6 <- base_top %>%
  dplyr::arrange(dplyr::desc(cur_rev)) %>%
  dplyr::transmute(
    SKU           = StockCode,
    Description   = dplyr::coalesce(Description, "(No description)"),
    Revenue_num   = dplyr::if_else(is.na(cur_rev),   0, cur_rev),
    Units_num     = dplyr::if_else(is.na(cur_units), 0, cur_units),
    YoY_chr       = dplyr::if_else(is.na(YoY), "—", scales::percent(YoY, accuracy = 0.1)),
    Momentum_chr  = dplyr::coalesce(MomentumLabel, "—"),
    Return_chr    = dplyr::if_else(is.na(ReturnRate), "—", scales::percent(ReturnRate, accuracy = 0.1))
  ) %>%
  dplyr::mutate(
    Revenue = scales::dollar(Revenue_num, accuracy = 1),
    Units   = formatC(Units_num, big.mark = ",", format = "f", digits = 0)
  ) %>%
  dplyr::transmute(
    SKU, Description, Revenue, Units,
    `YoY %` = YoY_chr,
    `3M Momentum` = Momentum_chr,
    `Return Rate` = Return_chr
  ) %>%
  dplyr::mutate(dplyr::across(dplyr::everything(), ~ ifelse(is.na(.), "—", as.character(.))))

tbl6 %>%
  kableExtra::kbl(
    caption = paste0("Table 6. Product Leaderboard — Top 10 Most Frequently Purchased (", current_period, ")"),
    align   = c("l","l","r","r","r","l","r")
  ) %>%
  kableExtra::kable_styling(full_width = FALSE, bootstrap_options = c("striped","hover"))

Table 6. Product Leaderboard — Top 10 Most Frequently Purchased (2011-2012)
SKU	Description	Revenue	Units	YoY %	3M Momentum	Return Rate
23084	RABBIT NIGHT LIGHT	$46,013	24,484	797.8%	↑ Accelerating	0.4%
85099B	JUMBO BAG RED RETROSPOT	$44,289	24,395	-45.2%	↑ Accelerating	4.0%
85123A	WHITE HANGING HEART T-LIGHT HOLDER	$41,020	14,774	-65.3%	↑ Accelerating	0.3%
22086	PAPER CHAIN KIT 50’S CHRISTMAS	$35,850	13,473	6.4%	↑ Accelerating	3.0%
84879	ASSORTED COLOUR BIRD ORNAMENT	$32,092	20,339	-51.6%	↑ Accelerating	0.2%
22197	POPCORN HOLDER	$22,598	30,309	-22.4%	↑ Accelerating	1.4%
22952	60 CAKE CASES VINTAGE CHRISTMAS	$6,705	13,561	-20.1%	↑ Accelerating	1.9%
21212	PACK OF 72 RETROSPOT CAKE CASES	$6,493	13,354	-70.7%	↑ Accelerating	1.5%
84077	WORLD WAR 2 GLIDERS ASSTD DESIGNS	$6,233	23,355	-54.7%	↑ Accelerating	0.2%
23843	PAPER CRAFT , LITTLE BIRDIE	$0	80,995	—	—	100.0%

# ---- Table 7: Decliner Watchlist (no 3M momentum) ----
decliners_top <- base_top %>%
  dplyr::left_join(trend_top, by = "StockCode") %>%
  dplyr::mutate(
    yoy_flag   = !is.na(YoY) & YoY < 0,
    slope_flag = !is.na(slope) & slope < 0
  ) %>%
  dplyr::filter(yoy_flag | slope_flag) %>%
  dplyr::transmute(
    SKU           = StockCode,
    Description   = dplyr::coalesce(Description, "(No description)"),
    Revenue_num   = dplyr::if_else(is.na(cur_rev),   0, cur_rev),
    Units_num     = dplyr::if_else(is.na(cur_units), 0, cur_units),
    YoY_chr       = dplyr::if_else(is.na(YoY), "—", scales::percent(YoY, accuracy = 0.1)),
    Return_chr    = dplyr::if_else(is.na(ReturnRate), "—", scales::percent(ReturnRate, accuracy = 0.1)),
    Trend_chr     = dplyr::case_when(
      is.na(slope) ~ "—",
      slope < 0    ~ "↓ Declining",
      slope > 0    ~ "↑ Improving",
      TRUE         ~ "→ Flat"
    )
  ) %>%
  dplyr::arrange(dplyr::desc(Revenue_num))

if (nrow(decliners_top) == 0) {
  tibble::tibble(Note = "No top-10 SKUs meet decliner criteria (negative YoY or negative 6M trend).") %>%
    kableExtra::kbl(
      caption = paste0("Table 7. Decliner Watchlist — Top 10 Most Frequently Purchased (", current_period, ")")
    ) %>%
    kableExtra::kable_styling(full_width = FALSE, bootstrap_options = c("striped","hover"))
} else {
  decliners_top %>%
    dplyr::mutate(
      Revenue = scales::dollar(Revenue_num, accuracy = 1),
      Units   = formatC(Units_num, big.mark = ",", format = "f", digits = 0)
    ) %>%
    dplyr::transmute(
      SKU, Description, Revenue, Units,
      `YoY %`       = YoY_chr,
      `Return Rate` = Return_chr,
      `6M Trend`    = Trend_chr
    ) %>%
    dplyr::mutate(dplyr::across(dplyr::everything(), ~ ifelse(is.na(.), "—", as.character(.)))) %>%
    kableExtra::kbl(
      caption = paste0("Table 7. Decliner Watchlist — Top 10 Most Frequently Purchased (", current_period, ")"),
      align   = c("l","l","r","r","r","r","l")
    ) %>%
    kableExtra::kable_styling(full_width = FALSE, bootstrap_options = c("striped","hover"))
}

Table 7. Decliner Watchlist — Top 10 Most Frequently Purchased (2011-2012)
SKU	Description	Revenue	Units	YoY %	Return Rate	6M Trend
85099B	JUMBO BAG RED RETROSPOT	$44,289	24,395	-45.2%	4.0%	↑ Improving
85123A	WHITE HANGING HEART T-LIGHT HOLDER	$41,020	14,774	-65.3%	0.3%	↑ Improving
84879	ASSORTED COLOUR BIRD ORNAMENT	$32,092	20,339	-51.6%	0.2%	↑ Improving
22197	POPCORN HOLDER	$22,598	30,309	-22.4%	1.4%	↑ Improving
22952	60 CAKE CASES VINTAGE CHRISTMAS	$6,705	13,561	-20.1%	1.9%	↑ Improving
21212	PACK OF 72 RETROSPOT CAKE CASES	$6,493	13,354	-70.7%	1.5%	↓ Declining
84077	WORLD WAR 2 GLIDERS ASSTD DESIGNS	$6,233	23,355	-54.7%	0.2%	↑ Improving

suppressPackageStartupMessages({
  library(dplyr); library(lubridate); library(tidyr)
  library(readr); library(kableExtra); library(scales)
})

# --- Identify top 10 most frequently purchased items ---
top10_freq <- no_NA_df %>%
  dplyr::filter(Quantity > 0) %>%
  dplyr::group_by(StockCode, Description) %>%
  dplyr::summarise(total_units = sum(Quantity, na.rm = TRUE), .groups = "drop") %>%
  dplyr::arrange(dplyr::desc(total_units)) %>%
  dplyr::slice_head(n = 10)

top10_ids <- top10_freq$StockCode

# --- Filter dataset to top 10 items only ---
df_top10 <- no_NA_df %>%
  dplyr::filter(StockCode %in% top10_ids)

# --- Inventory Parameters ---
service_level <- 0.95
z_value <- qnorm(service_level)
holding_cost_per_unit_per_year <- 2.00
order_cost_per_order <- 50.00
working_days_per_year <- 365
lead_time_days <- 7

# --- Daily demand (recent 90 days for top 10) ---
df_daily <- df_top10 %>%
  dplyr::mutate(Date = as.Date(InvoiceDate)) %>%
  dplyr::group_by(StockCode, Description, Date) %>%
  dplyr::summarise(units = sum(Quantity[Quantity > 0], na.rm = TRUE), .groups = "drop")

last_date <- max(df_daily$Date, na.rm = TRUE)
start_lookback <- last_date - days(89)

df_scope <- df_daily %>%
  dplyr::filter(Date >= start_lookback & Date <= last_date)

# --- Demand stats per SKU ---
demand_stats <- df_scope %>%
  dplyr::group_by(StockCode, Description) %>%
  dplyr::summarise(
    mean_daily = mean(units, na.rm = TRUE),
    sd_daily   = sd(units, na.rm = TRUE),
    days_obs   = n(),
    .groups = "drop"
  )

# --- Compute Safety Stock, ROP, EOQ ---
inv_opt <- demand_stats %>%
  dplyr::mutate(
    sd_lead = sd_daily * sqrt(lead_time_days),
    SafetyStock = z_value * sd_lead,
    ReorderPoint = pmax(0, mean_daily * lead_time_days + SafetyStock),
    AnnualDemand = mean_daily * working_days_per_year,
    EOQ = ifelse(holding_cost_per_unit_per_year > 0,
                 sqrt((2 * AnnualDemand * order_cost_per_order) /
                        holding_cost_per_unit_per_year),
                 NA_real_)
  ) %>%
  dplyr::mutate(
    SafetyStock = round(SafetyStock, 0),
    ReorderPoint = round(ReorderPoint, 0),
    EOQ = round(EOQ, 0)
  )

# --- Table 8: Inventory Optimization (Top 10 SKUs) ---
inv_opt %>%
  dplyr::select(
    SKU = StockCode,
    Description,
    `Mean Daily Demand` = mean_daily,
    `Daily SD` = sd_daily,
    `Safety Stock` = SafetyStock,
    `Reorder Point` = ReorderPoint,
    `EOQ (units)` = EOQ
  ) %>%
  dplyr::mutate(
    `Mean Daily Demand` = round(`Mean Daily Demand`, 2),
    `Daily SD` = round(`Daily SD`, 2)
  ) %>%
  kableExtra::kbl(
    caption = "Table 8. Inventory Stock Optimization — Top 10 Most Frequently Purchased Items",
    align = c("l","l","r","r","r","r","r")
  ) %>%
  kableExtra::kable_styling(full_width = FALSE, bootstrap_options = c("striped","hover"))

Table 8. Inventory Stock Optimization — Top 10 Most Frequently Purchased Items
SKU	Description	Mean Daily Demand	Daily SD	Safety Stock	Reorder Point	EOQ (units)
17003	BROCADE RING PURSE	124.69	254.46	1107	1980	1508
21212	PACK OF 72 RETROSPOT CAKE CASES	93.35	121.02	527	1180	1508
21977	PACK OF 60 PINK PAISLEY CAKE CASES	65.51	120.62	525	983	1508
23166	MEDIUM CERAMIC TOP STORAGE JAR	24.76	41.32	180	353	1508
23843	PAPER CRAFT , LITTLE BIRDIE	80995.00	NA	NA	NA	1508
84077	WORLD WAR 2 GLIDERS ASSTD DESIGNS	239.03	566.44	2465	4138	1508
84879	ASSORTED COLOUR BIRD ORNAMENT	148.60	173.45	755	1795	1508
84991	60 TEATIME FAIRY CAKE CASES	75.70	119.47	520	1050	1508
85099B	JUMBO BAG RED RETROSPOT	194.88	248.15	1080	2444	1508
85123A	CREAM HANGING HEART T-LIGHT HOLDER	28.50	38.89	169	369	1508
85123A	WHITE HANGING HEART T-LIGHT HOLDER	117.86	160.82	700	1525	1508

OnlineRetail

AC Band, Dominica Peri, Camille Casey

2025-09-23

R Markdown