R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(MASS) 
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.1.0     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ MASS::select()  masks dplyr::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot)
## corrplot 0.95 loaded
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
library(GGally)
library(ROCR)
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(Hmisc)
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(tibble)
library(knitr)
library(here)
## here() starts at C:/Users/aliso/Documents/UTSA/Data Analytics Applications/OnlineRetail
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(broom)
library(tidyr)
# Read in 2009-2010

year1 <- read_excel("online_retail.xlsx", sheet = "Year 2009-2010")
 
# Read in 2010-2011

year2 <- read_excel("online_retail.xlsx", sheet = "Year 2010-2011")
glimpse(year1)
## Rows: 525,461
## Columns: 8
## $ Invoice       <chr> "489434", "489434", "489434", "489434", "489434", "48943…
## $ StockCode     <chr> "85048", "79323P", "79323W", "22041", "21232", "22064", …
## $ Description   <chr> "15CM CHRISTMAS GLASS BALL 20 LIGHTS", "PINK CHERRY LIGH…
## $ Quantity      <dbl> 12, 12, 12, 48, 24, 24, 24, 10, 12, 12, 24, 12, 10, 18, …
## $ InvoiceDate   <dttm> 2009-12-01 07:45:00, 2009-12-01 07:45:00, 2009-12-01 07…
## $ Price         <dbl> 6.95, 6.75, 6.75, 2.10, 1.25, 1.65, 1.25, 5.95, 2.55, 3.…
## $ `Customer ID` <dbl> 13085, 13085, 13085, 13085, 13085, 13085, 13085, 13085, …
## $ Country       <chr> "United Kingdom", "United Kingdom", "United Kingdom", "U…
glimpse(year2)
## Rows: 541,910
## Columns: 8
## $ Invoice       <chr> "536365", "536365", "536365", "536365", "536365", "53636…
## $ StockCode     <chr> "85123A", "71053", "84406B", "84029G", "84029E", "22752"…
## $ Description   <chr> "WHITE HANGING HEART T-LIGHT HOLDER", "WHITE METAL LANTE…
## $ Quantity      <dbl> 6, 6, 8, 6, 6, 2, 6, 6, 6, 6, 3, 3, 3, 32, 6, 6, 8, 6, 6…
## $ InvoiceDate   <dttm> 2010-12-01 08:26:00, 2010-12-01 08:26:00, 2010-12-01 08…
## $ Price         <dbl> 2.55, 3.39, 2.75, 3.39, 3.39, 7.65, 4.25, 1.85, 1.85, 4.…
## $ `Customer ID` <dbl> 17850, 17850, 17850, 17850, 17850, 17850, 17850, 17850, …
## $ Country       <chr> "United Kingdom", "United Kingdom", "United Kingdom", "U…
anyNA(year1)
## [1] TRUE
anyNA(year2)
## [1] TRUE
na_count_yr1 <- sum(is.na(year1))
print(na_count_yr1) 
## [1] 110855
na_count_yr2 <- sum(is.na(year2))
print(na_count_yr2) 
## [1] 136534
no_NA_year1 <- na.omit(year1)

no_NA_year2 <- na.omit(year2)
anyNA(no_NA_year1)
## [1] FALSE
anyNA(no_NA_year2)
## [1] FALSE
summary(no_NA_year1)
##    Invoice           StockCode         Description           Quantity       
##  Length:417534      Length:417534      Length:417534      Min.   :-9360.00  
##  Class :character   Class :character   Class :character   1st Qu.:    2.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    4.00  
##                                                           Mean   :   12.76  
##                                                           3rd Qu.:   12.00  
##                                                           Max.   :19152.00  
##   InvoiceDate                      Price            Customer ID   
##  Min.   :2009-12-01 07:45:00   Min.   :    0.000   Min.   :12346  
##  1st Qu.:2010-03-26 11:26:00   1st Qu.:    1.250   1st Qu.:13983  
##  Median :2010-07-08 19:12:00   Median :    1.950   Median :15311  
##  Mean   :2010-07-01 01:12:19   Mean   :    3.888   Mean   :15361  
##  3rd Qu.:2010-10-14 14:08:00   3rd Qu.:    3.750   3rd Qu.:16799  
##  Max.   :2010-12-09 20:01:00   Max.   :25111.090   Max.   :18287  
##    Country         
##  Length:417534     
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
summary(no_NA_year2)
##    Invoice           StockCode         Description           Quantity        
##  Length:406830      Length:406830      Length:406830      Min.   :-80995.00  
##  Class :character   Class :character   Class :character   1st Qu.:     2.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :     5.00  
##                                                           Mean   :    12.06  
##                                                           3rd Qu.:    12.00  
##                                                           Max.   : 80995.00  
##   InvoiceDate                      Price            Customer ID   
##  Min.   :2010-12-01 08:26:00   Min.   :    0.000   Min.   :12346  
##  1st Qu.:2011-04-06 15:02:00   1st Qu.:    1.250   1st Qu.:13953  
##  Median :2011-07-31 11:48:00   Median :    1.950   Median :15152  
##  Mean   :2011-07-10 16:31:30   Mean   :    3.461   Mean   :15288  
##  3rd Qu.:2011-10-20 13:06:00   3rd Qu.:    3.750   3rd Qu.:16791  
##  Max.   :2011-12-09 12:50:00   Max.   :38970.000   Max.   :18287  
##    Country         
##  Length:406830     
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
no_NA_df <- bind_rows(no_NA_year1, no_NA_year2)

# Rename column in position 7 to 'CustomerID'
colnames(no_NA_df)[7] <- 'CustomerID'
head(no_NA_df)
## # A tibble: 6 × 8
##   Invoice StockCode Description    Quantity InvoiceDate         Price CustomerID
##   <chr>   <chr>     <chr>             <dbl> <dttm>              <dbl>      <dbl>
## 1 489434  85048     "15CM CHRISTM…       12 2009-12-01 07:45:00  6.95      13085
## 2 489434  79323P    "PINK CHERRY …       12 2009-12-01 07:45:00  6.75      13085
## 3 489434  79323W    "WHITE CHERRY…       12 2009-12-01 07:45:00  6.75      13085
## 4 489434  22041     "RECORD FRAME…       48 2009-12-01 07:45:00  2.1       13085
## 5 489434  21232     "STRAWBERRY C…       24 2009-12-01 07:45:00  1.25      13085
## 6 489434  22064     "PINK DOUGHNU…       24 2009-12-01 07:45:00  1.65      13085
## # ℹ 1 more variable: Country <chr>
library(dplyr)
library(lubridate)
library(ggplot2)
#Top selling products for 2009-2011

no_NA_revenue<-no_NA_df |> group_by(StockCode, Description)|>summarise(sales=sum(Quantity*Price))|>ungroup()|> arrange(desc(sales))
## `summarise()` has grouped output by 'StockCode'. You can override using the
## `.groups` argument.
no_NA_revenue
## # A tibble: 5,302 × 3
##    StockCode Description                          sales
##    <chr>     <chr>                                <dbl>
##  1 22423     REGENCY CAKESTAND 3 TIER           269737.
##  2 85123A    WHITE HANGING HEART T-LIGHT HOLDER 242701.
##  3 85099B    JUMBO BAG RED RETROSPOT            134845.
##  4 84879     ASSORTED COLOUR BIRD ORNAMENT      126354.
##  5 POST      POSTAGE                            112249.
##  6 47566     PARTY BUNTING                      102686.
##  7 22086     PAPER CHAIN KIT 50'S CHRISTMAS      78367.
##  8 79321     CHILLI LIGHTS                       72229.
##  9 21137     BLACK RECORD COVER FRAME            67127.
## 10 85099F    JUMBO BAG STRAWBERRY                64089.
## # ℹ 5,292 more rows
#Look for time series trends

pacman::p_load("pacman","tidyverse", "openxlsx", "forecast", "modeltime", "parsnip", "rsample", "timetk", "xts", "tidyquant", "feasts", "prophet")
no_NA_dlyrevenue<-no_NA_df |> group_by(InvoiceDate)|>summarise(dlysales=sum(Quantity*Price))|>ungroup()|>arrange(desc(dlysales)) 

no_NA_dlyrevenue
## # A tibble: 41,439 × 2
##    InvoiceDate         dlysales
##    <dttm>                 <dbl>
##  1 2011-12-09 09:15:00  168470.
##  2 2011-01-18 10:01:00   77184.
##  3 2010-01-07 12:34:00   44052.
##  4 2011-06-10 15:28:00   39597.
##  5 2010-09-27 16:59:00   33168.
##  6 2010-12-07 16:43:00   32562.
##  7 2011-09-20 11:05:00   31698.
##  8 2010-10-14 09:46:00   26007.
##  9 2011-06-15 13:37:00   23427.
## 10 2010-07-15 15:29:00   22863.
## # ℹ 41,429 more rows
no_NA_top10revenue<-no_NA_df |>  filter(StockCode != "POST") |>  group_by(StockCode, Description)|>summarise(sales=sum(Quantity*Price))|>ungroup()|> arrange(desc(sales))|> slice_head(n=10)
## `summarise()` has grouped output by 'StockCode'. You can override using the
## `.groups` argument.
no_NA_top10revenue
## # A tibble: 10 × 3
##    StockCode Description                          sales
##    <chr>     <chr>                                <dbl>
##  1 22423     REGENCY CAKESTAND 3 TIER           269737.
##  2 85123A    WHITE HANGING HEART T-LIGHT HOLDER 242701.
##  3 85099B    JUMBO BAG RED RETROSPOT            134845.
##  4 84879     ASSORTED COLOUR BIRD ORNAMENT      126354.
##  5 47566     PARTY BUNTING                      102686.
##  6 22086     PAPER CHAIN KIT 50'S CHRISTMAS      78367.
##  7 79321     CHILLI LIGHTS                       72229.
##  8 21137     BLACK RECORD COVER FRAME            67127.
##  9 85099F    JUMBO BAG STRAWBERRY                64089.
## 10 82484     WOOD BLACK BOARD ANT WHITE FINISH   60188.
ggplot(no_NA_top10revenue, aes(x = reorder(Description, sales), y = sales)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +  # To flip the bars horizontally for better readability
  labs(title = "Top Selling Products (2009-2011)", 
       x = "Product", 
       y = "Total Sales (in $10k)") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 8))  

ggplot(data=no_NA_dlyrevenue, aes(x=InvoiceDate, y=dlysales)) + geom_line() + geom_ma(ma_fun = SMA, color = "orange", n = 4) + geom_ma(ma_fun = SMA, n=5, color = "lightblue")+ theme_classic() + theme(axis.text.x = element_text(angle = 45, hjust = 1))

no_NA_df$sales <- no_NA_df$Quantity*no_NA_df$Price

head(no_NA_df)
## # A tibble: 6 × 9
##   Invoice StockCode Description    Quantity InvoiceDate         Price CustomerID
##   <chr>   <chr>     <chr>             <dbl> <dttm>              <dbl>      <dbl>
## 1 489434  85048     "15CM CHRISTM…       12 2009-12-01 07:45:00  6.95      13085
## 2 489434  79323P    "PINK CHERRY …       12 2009-12-01 07:45:00  6.75      13085
## 3 489434  79323W    "WHITE CHERRY…       12 2009-12-01 07:45:00  6.75      13085
## 4 489434  22041     "RECORD FRAME…       48 2009-12-01 07:45:00  2.1       13085
## 5 489434  21232     "STRAWBERRY C…       24 2009-12-01 07:45:00  1.25      13085
## 6 489434  22064     "PINK DOUGHNU…       24 2009-12-01 07:45:00  1.65      13085
## # ℹ 2 more variables: Country <chr>, sales <dbl>
ggplot(data=no_NA_df, aes(x=InvoiceDate, y=sales)) + geom_line() + geom_ma(ma_fun = SMA, color = "orange", n = 4) + geom_ma(ma_fun = SMA, n=5, color = "lightblue")+ theme_classic() + theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(dplyr)

pacman::p_load(pacman,
  tidyverse, openxlsx, forecast, modeltime, parsnip, rsample, timetk, xts, tidyquant, feasts, prophet)
d3 <- no_NA_df %>%
  mutate(Date = as.Date(InvoiceDate, format = "%Y-%m-%d")) %>%
  group_by(Date) %>%
  summarise(Daily_Sales = sum(sales))


head(d3)
## # A tibble: 6 × 2
##   Date       Daily_Sales
##   <date>           <dbl>
## 1 2009-12-01      42708.
## 2 2009-12-02      52578.
## 3 2009-12-03      61534.
## 4 2009-12-04      33687.
## 5 2009-12-05       9803.
## 6 2009-12-06      24284.
ggplot(d3, aes(x = Date, y = Daily_Sales)) +
  geom_line(color = "blue") +
  labs(title = "Daily Sales Trend", x = "Date", y = "Sales") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

sales_ts <- ts(d3$Daily_Sales, start = c(year(min(d3$Date)), month(min(d3$Date))), frequency = 365)
# Fit ARIMA model
arima_model <- auto.arima(sales_ts)

# Forecast future demand
forecast_arima <- forecast(arima_model, h = 30)  # Sales forecast for the next 30 days

# Plot the forecast
autoplot(forecast_arima) +
  labs(title = "ARIMA Forecast", x = "Date", y = "Sales")

# Fit ETS model
ets_model <- ets(sales_ts)
## Warning in ets(sales_ts): I can't handle data with frequency greater than 24.
## Seasonality will be ignored. Try stlf() if you need seasonal forecasts.
# Forecast future demand
forecast_ets <- forecast(ets_model, h = 30)

# Plot the forecast
autoplot(forecast_ets) +
  labs(title = "ETS Forecast", x = "Date", y = "Sales")

suppressPackageStartupMessages({
  library(dplyr)
  library(lubridate)
  library(tidyr)
  library(purrr)
  library(forecast)
  library(ggplot2)
  library(readr)
})

# ---------------- USER INPUTS ----------------
exclude_codes <- c("POST")   # e.g., c("22423")
horizon_days  <- 60    # number of future days to forecast
lookback_days_for_top <- 0  # 0 = all history
# --------------------------------------------

# Ensure required columns exist: InvoiceDate, StockCode, Quantity, Price
stopifnot(all(c("InvoiceDate","StockCode","Quantity","Price") %in% names(no_NA_df)))

# 1) Standardize types
df <- no_NA_df %>%
  dplyr::mutate(
    InvoiceDate = as.Date(InvoiceDate),
    StockCode   = as.character(StockCode),
    Quantity    = as.numeric(Quantity),
    Price       = as.numeric(Price)
  ) %>%
  dplyr::filter(!is.na(InvoiceDate), !is.na(StockCode), !is.na(Quantity), !is.na(Price))

# If Description missing, create it (keeps code robust)
if (!"Description" %in% names(df)) {
  df <- df %>% dplyr::mutate(Description = NA_character_)
} else {
  df <- df %>% dplyr::mutate(Description = as.character(Description))
}

# 2) Top-10 products by revenue (optional lookback)
df_top_scope <- if (lookback_days_for_top > 0) {
  cutoff <- max(df$InvoiceDate, na.rm = TRUE) - lookback_days_for_top
  df %>% dplyr::filter(InvoiceDate > cutoff)
} else df

top10_tbl <- df_top_scope %>%
  dplyr::filter(!StockCode %in% exclude_codes) %>%
  dplyr::group_by(StockCode, Description) %>%
  dplyr::summarise(sales = sum(Quantity * Price, na.rm = TRUE), .groups = "drop") %>%
  dplyr::arrange(dplyr::desc(sales)) %>%
  dplyr::slice_head(n = 10)

top10_ids <- top10_tbl$StockCode
message("Top 10 StockCodes: ", paste(top10_ids, collapse = ", "))
## Top 10 StockCodes: 22423, 85123A, 85099B, 84879, 47566, 22086, 79321, 21137, 85099F, 82484
# 3) Aggregate DAILY demand & fill missing days
daily_raw <- df %>%
  dplyr::filter(StockCode %in% top10_ids) %>%
  dplyr::group_by(StockCode, InvoiceDate) %>%
  dplyr::summarise(units = sum(Quantity), .groups = "drop")

daily <- daily_raw %>%
  dplyr::group_by(StockCode) %>%
  tidyr::complete(InvoiceDate = seq(min(InvoiceDate), max(InvoiceDate), by = "1 day"),
                  fill = list(units = 0)) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(StockCode, InvoiceDate)

# 4) Modeling helpers
safe_fit <- function(y) {
  fit <- tryCatch(
    forecast::auto.arima(y, seasonal = TRUE, stepwise = FALSE, approximation = FALSE),
    error = function(e) NULL
  )
  if (is.null(fit)) forecast::ets(y) else fit
}

to_ts <- function(df_prod) {
  stats::ts(df_prod$units, frequency = 7)  # daily with weekly seasonality
}

# 5) Fit + forecast per product
results <- top10_ids %>%
  purrr::map_dfr(function(pid) {
    d <- daily %>% dplyr::filter(StockCode == pid)

    if (sum(d$units > 0, na.rm = TRUE) < 3) {
      fc_vals <- rep(mean(d$units, na.rm = TRUE), horizon_days)
      tibble::tibble(
        StockCode = pid,
        day_ahead = 1:horizon_days,
        forecast = fc_vals,
        lo95 = NA_real_,
        hi95 = NA_real_
      )
    } else {
      y  <- to_ts(d)
      fit <- safe_fit(y)
      fc  <- forecast::forecast(fit, h = horizon_days)

      tibble::tibble(
        StockCode = pid,
        day_ahead = 1:horizon_days,
        forecast = as.numeric(fc$mean),
        lo95 = as.numeric(fc$lower[, "95%"]),
        hi95 = as.numeric(fc$upper[, "95%"])
      )
    }
  }) %>%
  dplyr::left_join(
    top10_tbl %>% dplyr::select(StockCode, Description),
    by = "StockCode"
  )

# 6) Save outputs
readr::write_csv(results, "top10_daily_forecasts2.csv")
message("Saved: top10_daily_forecasts2.csv")
## Saved: top10_daily_forecasts2.csv
# 7) Plot one product’s history + forecast
plot_one <- function(pid) {
  df_hist <- daily   %>% dplyr::filter(StockCode == pid)
  df_fc   <- results %>% dplyr::filter(StockCode == pid)

  title_txt <- paste0(pid, if (!all(is.na(df_fc$Description))) {
    paste0(" – ", unique(stats::na.omit(df_fc$Description))[1])
  } else "")

  last_hist <- max(df_hist$InvoiceDate, na.rm = TRUE)

 ggplot2::ggplot() +
    # historical demand in blue
    ggplot2::geom_line(data = df_hist, 
                       ggplot2::aes(x = InvoiceDate, y = units), 
                       color = "steelblue", linewidth = 1) +
    # forecast interval in light red
    ggplot2::geom_ribbon(data = df_fc,
                         ggplot2::aes(x = last_hist + day_ahead,
                                      ymin = lo95, ymax = hi95),
                         fill = "salmon", alpha = 0.3) +
    # forecast line in dark red
    ggplot2::geom_line(data = df_fc,
                       ggplot2::aes(x = last_hist + day_ahead, y = forecast),
                       color = "firebrick", linewidth = 1) +
    ggplot2::labs(title = paste("Daily Demand & Forecast:", title_txt),
                  x = "Date", y = "Units") +
    ggplot2::theme_minimal()
}

# Example:
# plot_one(top10_ids[1])
for (pid in top10_ids) {
  print(plot_one(pid))
}

suppressPackageStartupMessages({
  library(dplyr); library(lubridate); library(tidyr)
})

# ---- Choose how to define a "Period" ----
# Set fiscal_start_month = 1 for calendar years (Jan–Dec -> "YYYY-YYYY+1" labels still okay),
# or use e.g. 7 for fiscal (Jul–Jun), 4 for Apr–Mar, etc.
fiscal_start_month <- 7  # << change to 1 for calendar-year style periods

no_NA_df <- no_NA_df %>%
  dplyr::mutate(
    InvoiceDate = as.POSIXct(InvoiceDate),
    Date        = as.Date(InvoiceDate),
    StockCode   = as.character(StockCode),
    Description = as.character(Description),
    Quantity    = as.numeric(Quantity),
    Price       = as.numeric(Price),
    sales       = Quantity * Price
  ) %>%
  # Create Period dynamically from Date and fiscal start
  dplyr::mutate(
    Year  = lubridate::year(Date),
    Month = lubridate::month(Date),
    PeriodStartYear = dplyr::if_else(Month < fiscal_start_month, Year - 1L, Year),
    PeriodEndYear   = PeriodStartYear + 1L,
    Period = paste0(PeriodStartYear, "-", PeriodEndYear)
  )

# Sanity check (optional)
# print(table(no_NA_df$Period, useNA="ifany"))
### 1) Auto-select current & prior periods, compute KPIs (dedup safe)

suppressPackageStartupMessages({
  library(dplyr); library(lubridate); library(tidyr); library(purrr); library(scales)
})

# ---- Pick the latest two observed Periods dynamically ----
periods_found <- no_NA_df %>%
  dplyr::distinct(Period, PeriodStartYear, PeriodEndYear) %>%
  dplyr::arrange(dplyr::desc(PeriodStartYear), dplyr::desc(PeriodEndYear))

stopifnot(nrow(periods_found) >= 1)
current_period <- periods_found$Period[1]
prior_period   <- if (nrow(periods_found) >= 2) periods_found$Period[2] else NA_character_

# ---- Revenue / Units / Returns by Period + SKU + Description ----
rev_by_period <- no_NA_df %>%
  dplyr::group_by(Period, StockCode, Description) %>%
  dplyr::summarise(
    Revenue       = sum(sales, na.rm = TRUE),
    Units_shipped = sum(Quantity[Quantity > 0], na.rm = TRUE),
    Units_return  = sum(abs(Quantity[Quantity < 0]), na.rm = TRUE),
    .groups = "drop"
  )

# ---- Collapse to one row per SKU for each period (dedupe fix) ----
cur <- rev_by_period %>%
  dplyr::filter(Period == current_period) %>%
  dplyr::group_by(StockCode) %>%
  dplyr::summarise(
    # pick representative non-NA Description by highest Revenue
    Description = {
      tmp <- dplyr::arrange(pick(dplyr::everything()), dplyr::desc(Revenue))
      dplyr::first(stats::na.omit(tmp$Description))
    },
    cur_rev     = sum(Revenue, na.rm = TRUE),
    cur_units   = sum(Units_shipped, na.rm = TRUE),
    cur_ret     = sum(Units_return, na.rm = TRUE),
    .groups = "drop"
  )

prior <- rev_by_period %>%
  dplyr::filter(Period == prior_period) %>%
  dplyr::group_by(StockCode) %>%
  dplyr::summarise(
    prior_rev = sum(Revenue, na.rm = TRUE),
    .groups = "drop"
  )

# ---- YoY + ReturnRate ----
yoy_tbl <- cur %>%
  dplyr::left_join(prior, by = "StockCode") %>%
  dplyr::mutate(
    YoY = dplyr::case_when(
      is.na(prior_rev) ~ NA_real_,
      prior_rev == 0 & cur_rev > 0 ~ NA_real_,
      TRUE ~ (cur_rev - prior_rev) / prior_rev
    ),
    ReturnRate = dplyr::case_when(
      is.na(cur_units) | cur_units <= 0 ~ NA_real_,
      TRUE ~ cur_ret / cur_units
    )
  )

# ---- Build helpers for Momentum (current period) & 6M Trend ----
cur_df <- no_NA_df %>% dplyr::filter(Period == current_period)

last_date <- max(cur_df$Date, na.rm = TRUE)
last_complete_month <- lubridate::floor_date(last_date, "month") - lubridate::days(1)
end_3m   <- last_complete_month
start_3m <- lubridate::floor_date(end_3m %m-% months(2), "month")
end_prev <- start_3m - lubridate::days(1)
start_prev <- lubridate::floor_date(end_prev %m-% months(2), "month")

momentum_tbl <- cur_df %>%
  dplyr::filter(Date >= start_prev & Date <= end_3m) %>%
  dplyr::mutate(
    bucket = dplyr::case_when(
      Date >= start_3m & Date <= end_3m ~ "last3",
      Date >= start_prev & Date <= end_prev ~ "prev3",
      TRUE ~ NA_character_
    )
  ) %>%
  dplyr::filter(!is.na(bucket)) %>%
  dplyr::group_by(StockCode, bucket) %>%
  dplyr::summarise(rev = sum(sales, na.rm = TRUE), .groups = "drop") %>%
  tidyr::pivot_wider(names_from = bucket, values_from = rev, values_fill = 0) %>%
  dplyr::mutate(
    Momentum3M = dplyr::if_else(prev3 > 0, (last3 / prev3) - 1, NA_real_),
    MomentumLabel = dplyr::case_when(
      is.na(Momentum3M)  ~ "—",
      Momentum3M > 0.05  ~ "↑ Accelerating",
      Momentum3M < -0.05 ~ "↓ Softening",
      TRUE               ~ "→ Stable"
    )
  ) %>%
  dplyr::select(StockCode, Momentum3M, MomentumLabel)

six_end   <- last_complete_month
six_start <- lubridate::floor_date(six_end %m-% months(5), "month")

monthly_cur <- cur_df %>%
  dplyr::filter(Date >= six_start & Date <= six_end) %>%
  dplyr::mutate(ym = lubridate::floor_date(Date, "month")) %>%
  dplyr::group_by(StockCode, ym) %>%
  dplyr::summarise(rev = sum(sales, na.rm = TRUE), .groups = "drop")

# ---- 6-Month Trend (safe version) ----
trend_tbl <- monthly_cur %>%
  dplyr::group_by(StockCode) %>%
  dplyr::mutate(t = as.numeric(ym)) %>%
  tidyr::nest() %>%
  dplyr::mutate(
    model = purrr::map(data, ~ tryCatch(
      stats::lm(rev ~ t, data = .x),
      error = function(e) NULL
    )),
    slope = purrr::map_dbl(model, ~ {
      if (is.null(.x)) NA_real_ else stats::coef(.x)[["t"]]
    })
  ) %>%
  dplyr::select(StockCode, slope) %>%
  dplyr::ungroup()

# ---- Base for current period ----
base_tbl <- yoy_tbl %>%
  dplyr::left_join(momentum_tbl, by = "StockCode")
### 2) Limit to Top-10 most frequently purchased SKUs (current period) and render tables


suppressPackageStartupMessages({ library(kableExtra); library(scales) })

# ---- Top 10 by purchase frequency (units) in current period ----
top10_freq <- no_NA_df %>%
  dplyr::filter(Period == current_period, Quantity > 0) %>%
  dplyr::group_by(StockCode, Description) %>%
  dplyr::summarise(total_units = sum(Quantity, na.rm = TRUE), .groups = "drop") %>%
  dplyr::arrange(dplyr::desc(total_units)) %>%
  dplyr::slice_head(n = 10)
top10_ids <- top10_freq$StockCode

base_top  <- base_tbl  %>% dplyr::filter(StockCode %in% top10_ids)
trend_top <- trend_tbl %>% dplyr::filter(StockCode %in% top10_ids)

# ---- Table 6: Product Leaderboard (exact columns) ----
tbl6 <- base_top %>%
  dplyr::arrange(dplyr::desc(cur_rev)) %>%
  dplyr::transmute(
    SKU           = StockCode,
    Description   = dplyr::coalesce(Description, "(No description)"),
    Revenue_num   = dplyr::if_else(is.na(cur_rev),   0, cur_rev),
    Units_num     = dplyr::if_else(is.na(cur_units), 0, cur_units),
    YoY_chr       = dplyr::if_else(is.na(YoY), "—", scales::percent(YoY, accuracy = 0.1)),
    Momentum_chr  = dplyr::coalesce(MomentumLabel, "—"),
    Return_chr    = dplyr::if_else(is.na(ReturnRate), "—", scales::percent(ReturnRate, accuracy = 0.1))
  ) %>%
  dplyr::mutate(
    Revenue = scales::dollar(Revenue_num, accuracy = 1),
    Units   = formatC(Units_num, big.mark = ",", format = "f", digits = 0)
  ) %>%
  dplyr::transmute(
    SKU, Description, Revenue, Units,
    `YoY %` = YoY_chr,
    `3M Momentum` = Momentum_chr,
    `Return Rate` = Return_chr
  ) %>%
  dplyr::mutate(dplyr::across(dplyr::everything(), ~ ifelse(is.na(.), "—", as.character(.))))

tbl6 %>%
  kableExtra::kbl(
    caption = paste0("Table 6. Product Leaderboard — Top 10 Most Frequently Purchased (", current_period, ")"),
    align   = c("l","l","r","r","r","l","r")
  ) %>%
  kableExtra::kable_styling(full_width = FALSE, bootstrap_options = c("striped","hover"))
Table 6. Product Leaderboard — Top 10 Most Frequently Purchased (2011-2012)
SKU Description Revenue Units YoY % 3M Momentum Return Rate
23084 RABBIT NIGHT LIGHT $46,013 24,484 797.8% ↑ Accelerating 0.4%
85099B JUMBO BAG RED RETROSPOT $44,289 24,395 -45.2% ↑ Accelerating 4.0%
85123A WHITE HANGING HEART T-LIGHT HOLDER $41,020 14,774 -65.3% ↑ Accelerating 0.3%
22086 PAPER CHAIN KIT 50’S CHRISTMAS $35,850 13,473 6.4% ↑ Accelerating 3.0%
84879 ASSORTED COLOUR BIRD ORNAMENT $32,092 20,339 -51.6% ↑ Accelerating 0.2%
22197 POPCORN HOLDER $22,598 30,309 -22.4% ↑ Accelerating 1.4%
22952 60 CAKE CASES VINTAGE CHRISTMAS $6,705 13,561 -20.1% ↑ Accelerating 1.9%
21212 PACK OF 72 RETROSPOT CAKE CASES $6,493 13,354 -70.7% ↑ Accelerating 1.5%
84077 WORLD WAR 2 GLIDERS ASSTD DESIGNS $6,233 23,355 -54.7% ↑ Accelerating 0.2%
23843 PAPER CRAFT , LITTLE BIRDIE $0 80,995 100.0%
# ---- Table 7: Decliner Watchlist (no 3M momentum) ----
decliners_top <- base_top %>%
  dplyr::left_join(trend_top, by = "StockCode") %>%
  dplyr::mutate(
    yoy_flag   = !is.na(YoY) & YoY < 0,
    slope_flag = !is.na(slope) & slope < 0
  ) %>%
  dplyr::filter(yoy_flag | slope_flag) %>%
  dplyr::transmute(
    SKU           = StockCode,
    Description   = dplyr::coalesce(Description, "(No description)"),
    Revenue_num   = dplyr::if_else(is.na(cur_rev),   0, cur_rev),
    Units_num     = dplyr::if_else(is.na(cur_units), 0, cur_units),
    YoY_chr       = dplyr::if_else(is.na(YoY), "—", scales::percent(YoY, accuracy = 0.1)),
    Return_chr    = dplyr::if_else(is.na(ReturnRate), "—", scales::percent(ReturnRate, accuracy = 0.1)),
    Trend_chr     = dplyr::case_when(
      is.na(slope) ~ "—",
      slope < 0    ~ "↓ Declining",
      slope > 0    ~ "↑ Improving",
      TRUE         ~ "→ Flat"
    )
  ) %>%
  dplyr::arrange(dplyr::desc(Revenue_num))

if (nrow(decliners_top) == 0) {
  tibble::tibble(Note = "No top-10 SKUs meet decliner criteria (negative YoY or negative 6M trend).") %>%
    kableExtra::kbl(
      caption = paste0("Table 7. Decliner Watchlist — Top 10 Most Frequently Purchased (", current_period, ")")
    ) %>%
    kableExtra::kable_styling(full_width = FALSE, bootstrap_options = c("striped","hover"))
} else {
  decliners_top %>%
    dplyr::mutate(
      Revenue = scales::dollar(Revenue_num, accuracy = 1),
      Units   = formatC(Units_num, big.mark = ",", format = "f", digits = 0)
    ) %>%
    dplyr::transmute(
      SKU, Description, Revenue, Units,
      `YoY %`       = YoY_chr,
      `Return Rate` = Return_chr,
      `6M Trend`    = Trend_chr
    ) %>%
    dplyr::mutate(dplyr::across(dplyr::everything(), ~ ifelse(is.na(.), "—", as.character(.)))) %>%
    kableExtra::kbl(
      caption = paste0("Table 7. Decliner Watchlist — Top 10 Most Frequently Purchased (", current_period, ")"),
      align   = c("l","l","r","r","r","r","l")
    ) %>%
    kableExtra::kable_styling(full_width = FALSE, bootstrap_options = c("striped","hover"))
}
Table 7. Decliner Watchlist — Top 10 Most Frequently Purchased (2011-2012)
SKU Description Revenue Units YoY % Return Rate 6M Trend
85099B JUMBO BAG RED RETROSPOT $44,289 24,395 -45.2% 4.0% ↑ Improving
85123A WHITE HANGING HEART T-LIGHT HOLDER $41,020 14,774 -65.3% 0.3% ↑ Improving
84879 ASSORTED COLOUR BIRD ORNAMENT $32,092 20,339 -51.6% 0.2% ↑ Improving
22197 POPCORN HOLDER $22,598 30,309 -22.4% 1.4% ↑ Improving
22952 60 CAKE CASES VINTAGE CHRISTMAS $6,705 13,561 -20.1% 1.9% ↑ Improving
21212 PACK OF 72 RETROSPOT CAKE CASES $6,493 13,354 -70.7% 1.5% ↓ Declining
84077 WORLD WAR 2 GLIDERS ASSTD DESIGNS $6,233 23,355 -54.7% 0.2% ↑ Improving
suppressPackageStartupMessages({
  library(dplyr); library(lubridate); library(tidyr)
  library(readr); library(kableExtra); library(scales)
})

# --- Identify top 10 most frequently purchased items ---
top10_freq <- no_NA_df %>%
  dplyr::filter(Quantity > 0) %>%
  dplyr::group_by(StockCode, Description) %>%
  dplyr::summarise(total_units = sum(Quantity, na.rm = TRUE), .groups = "drop") %>%
  dplyr::arrange(dplyr::desc(total_units)) %>%
  dplyr::slice_head(n = 10)

top10_ids <- top10_freq$StockCode

# --- Filter dataset to top 10 items only ---
df_top10 <- no_NA_df %>%
  dplyr::filter(StockCode %in% top10_ids)

# --- Inventory Parameters ---
service_level <- 0.95
z_value <- qnorm(service_level)
holding_cost_per_unit_per_year <- 2.00
order_cost_per_order <- 50.00
working_days_per_year <- 365
lead_time_days <- 7

# --- Daily demand (recent 90 days for top 10) ---
df_daily <- df_top10 %>%
  dplyr::mutate(Date = as.Date(InvoiceDate)) %>%
  dplyr::group_by(StockCode, Description, Date) %>%
  dplyr::summarise(units = sum(Quantity[Quantity > 0], na.rm = TRUE), .groups = "drop")

last_date <- max(df_daily$Date, na.rm = TRUE)
start_lookback <- last_date - days(89)

df_scope <- df_daily %>%
  dplyr::filter(Date >= start_lookback & Date <= last_date)

# --- Demand stats per SKU ---
demand_stats <- df_scope %>%
  dplyr::group_by(StockCode, Description) %>%
  dplyr::summarise(
    mean_daily = mean(units, na.rm = TRUE),
    sd_daily   = sd(units, na.rm = TRUE),
    days_obs   = n(),
    .groups = "drop"
  )

# --- Compute Safety Stock, ROP, EOQ ---
inv_opt <- demand_stats %>%
  dplyr::mutate(
    sd_lead = sd_daily * sqrt(lead_time_days),
    SafetyStock = z_value * sd_lead,
    ReorderPoint = pmax(0, mean_daily * lead_time_days + SafetyStock),
    AnnualDemand = mean_daily * working_days_per_year,
    EOQ = ifelse(holding_cost_per_unit_per_year > 0,
                 sqrt((2 * AnnualDemand * order_cost_per_order) /
                        holding_cost_per_unit_per_year),
                 NA_real_)
  ) %>%
  dplyr::mutate(
    SafetyStock = round(SafetyStock, 0),
    ReorderPoint = round(ReorderPoint, 0),
    EOQ = round(EOQ, 0)
  )

# --- Table 8: Inventory Optimization (Top 10 SKUs) ---
inv_opt %>%
  dplyr::select(
    SKU = StockCode,
    Description,
    `Mean Daily Demand` = mean_daily,
    `Daily SD` = sd_daily,
    `Safety Stock` = SafetyStock,
    `Reorder Point` = ReorderPoint,
    `EOQ (units)` = EOQ
  ) %>%
  dplyr::mutate(
    `Mean Daily Demand` = round(`Mean Daily Demand`, 2),
    `Daily SD` = round(`Daily SD`, 2)
  ) %>%
  kableExtra::kbl(
    caption = "Table 8. Inventory Stock Optimization — Top 10 Most Frequently Purchased Items",
    align = c("l","l","r","r","r","r","r")
  ) %>%
  kableExtra::kable_styling(full_width = FALSE, bootstrap_options = c("striped","hover"))
Table 8. Inventory Stock Optimization — Top 10 Most Frequently Purchased Items
SKU Description Mean Daily Demand Daily SD Safety Stock Reorder Point EOQ (units)
17003 BROCADE RING PURSE 124.69 254.46 1107 1980 1508
21212 PACK OF 72 RETROSPOT CAKE CASES 93.35 121.02 527 1180 1508
21977 PACK OF 60 PINK PAISLEY CAKE CASES 65.51 120.62 525 983 1508
23166 MEDIUM CERAMIC TOP STORAGE JAR 24.76 41.32 180 353 1508
23843 PAPER CRAFT , LITTLE BIRDIE 80995.00 NA NA NA 1508
84077 WORLD WAR 2 GLIDERS ASSTD DESIGNS 239.03 566.44 2465 4138 1508
84879 ASSORTED COLOUR BIRD ORNAMENT 148.60 173.45 755 1795 1508
84991 60 TEATIME FAIRY CAKE CASES 75.70 119.47 520 1050 1508
85099B JUMBO BAG RED RETROSPOT 194.88 248.15 1080 2444 1508
85123A CREAM HANGING HEART T-LIGHT HOLDER 28.50 38.89 169 369 1508
85123A WHITE HANGING HEART T-LIGHT HOLDER 117.86 160.82 700 1525 1508