Forecasting

Purpose

This script shows how I loop through three types of forecasting models in order to predict unit sales for over 850 products. The three models are random walk, the entire family of exponential smoothing models, and multiple ARIMA models.

Load libraries

library(readxl)
library(tools)
library(gtools)
library(forecast)
library(dplyr)
library(lubridate)
library(openxlsx)
library(stringr)

Script settings

excel_write <- 'no'
forecast_steps <- 4

read_path <- '/Users/jarad/Fake\ Folder/Fab/Projects/War\ Chest/CSVs/'
write_path <- '/Users/jarad/Desktop/Desktop/Portfolio\ Scripts/Unit\ Sales\ Forecasts\ for\ 800\ Products/'

Get data

Using a Jupyter Notebook, the data is pulled with SQL, and cleaned and structured with Python. Then I create a CSV of structured data and read it into RStudio.

The data is historical part sales, divided into two departments (“dept A” and “dept B”), and further divided into popularity tiers, which are:
* Key: Products we sell the most of AND make the most money on
* High: Excluding the parts above, the top 25% by unit sales
* Med: Next 25%
* Low: Bottom 50%
* Not Enough Data: Popularity tiers are based on the most recent three months of data, so if a product has fewer than three months of data, I cannot determine its popularity, and I label it as “not enough data”.

I do this so each department can choose where and when to focus their resources, first starting with Key, then moving on to High, etc., skipping Low if they need to.

# excluded parts are products we don't sell to normal customers, are discontinued, part of other products, etc.
excluded <- read_excel(paste(read_path,'Excluded\ Parts.xlsx', sep = ''))
colnames(excluded) <- str_replace_all(colnames(excluded), ' ', '_')

# read in structured data
main <- read_excel(paste(read_path,'War\ Chest\ Part\ sales\ for\ R.xlsx', sep = ''))
colnames(main) <- str_replace_all(colnames(main), ' ', '_')
main$year_and_month <- as.Date(main$year_and_month)

# mask the department names
bom_fix <- function(x){
  if(x == 'pnp'){
    ret <- 'dept A'
    } else{
      ret <- 'dept B'
      }
  return(ret)
}

main$bom_type <- sapply(main$bom_type, bom_fix)
excluded$bom_type <- sapply(excluded$bom_type, bom_fix)

Scramble data

As a courtesy to my company, I’ll scramble the part IDs and product names.

set.seed(1)
n <- sample(1:100, 1)

main$part_id <- as.integer(main$part_id * n )
excluded$part_id <- as.integer(excluded$part_id * n )

scramble <- function(x){
  set.seed(1)
  s <- paste(sample(letters, nchar(x), replace = TRUE), collapse = '')
  new_word <- paste('product: ', s, sep = '')
  return(new_word)
}

main$products_name <- sapply(main$products_name, scramble)
excluded$products_name <- sapply(excluded$products_name, scramble)

Check out unique part count

l <- length(unique(main$part_id))
print(paste('the unique part count is', l))

## [1] "the unique part count is 852"

Define models

I will define three models: Naive, ARIMA, and Exponential Smoothing. I’ll then send each part through each model, choosing the best model per part based on the RMSE (root mean squared error).

# Random Walk, or ARIMA(0,0,0)(0,1,0), or Naive (different names for the same model)
# use this as the baseline model
naive_func <- function(t1, forecast_steps){
  model <- rwf(t1, h = forecast_steps, level = c(95))
  model_counter <- 1
  ret = list('model' = model, 'model_counter' = model_counter)
  return(ret)
}

# ARIMA
arima_func <- function(t1, forecast_steps){
  trace <- capture.output({
    m1 <- auto.arima(t1, trace = TRUE, approximation = FALSE)
    model <- forecast(m1, h = forecast_steps, level = c(95))
  })
con    <- textConnection(trace)
models <- read.table(con, sep = ":")
model_counter <- length(models[,1]) - 1
close(con)
ret <- list('model' = model, 'model_counter' = model_counter)
  return(ret)
}

# Exponential Smoothing
# loop through all types of exponential smoothing models (simple, double, triple) and choose the best one based on the RMSE
holt_func <- function(t1, forecast_steps){
  model_counter <- 0
  sse <- Inf
  for (w in list('additive','multiplicative')){
    for (alpha in list(TRUE,FALSE)){
      for (beta in list(TRUE,FALSE)){
        for (gamma in list(TRUE,FALSE)){
          tryCatch({
            model_counter <- model_counter + 1
            m1 <- HoltWinters(t1, season = c(w), alpha = alpha, beta = beta, gamma = gamma)
            if (m1$SSE < sse){
              sse <- m1$SSE
              model <- forecast(m1, h = forecast_steps, level = c(95))
            }
          }, error = function(e){})
        }
      }
    }
  }
  ret = list('model' = model, 'model_counter' = model_counter)
  return(ret)
}

Prepare for model building

# get a list of all unique part numbers
all_pns <- unique(main$part_id)

# get dates for desired forecasts
d <- max(main$year_and_month)
d <- as.Date(d) + months(1)
ds <- seq(as.Date(d), by = 'month', length.out = forecast_steps)

# get just year_and_month
d1 <- substring(ds[1], 1,7)
d2 <- substring(ds[length(ds)], 1,7)

Put each part through each model

Choose the best model, per part, based on the RMSE

start <- Sys.time()

# create empty df to store results
final_models <- data.frame(matrix(nrow = 0, ncol = 8))

# prepare counters and lists
total_models <- 0
total_pns <- 0
final_models <- list()

models <- c(naive_func,
            arima_func,
            holt_func)

# loop through all parts (or "PNs", i.e., "product numbers")
for (pn in all_pns){
  total_pns <- total_pns + 1
  model_counter <- 0
  
  # prepare each part for model building; split into training and test sets
  df <- subset(main, part_id == pn)

  n = ceiling(nrow(df) * 0.80)
  train_data <- df[1:n,]
  test_data <- df[(n+1):nrow(df),]
  
  train <- train_data$qty_total
  test <- test_data$qty_total
  
  steps <- length(test)
  RMSE <- Inf
  
  tryCatch({
  # loop through each model and choose the best
  for (i in 1:length(models)){
    m <- models[[i]]
    
    # fit the model to the training set, forecast out as many months as there are in the test set
    fit <- m(train, steps)
    total_models <- total_models + fit$model_counter
    
    # get the RMSE of the model over the test set
    rmse <- accuracy(fit$model, test)[4]
    
    # if the RMSE is the lowest then choose that model
    if (rmse < RMSE){
      RMSE <- rmse
      m2 <- m
      }
    }
  }, error = function(e){})
  
  # send all data through the best model and forecast over the unknown future
  f <- m2(df$qty_total, forecast_steps)
  
  # construct dataframe of forecasts for each part
  model_used <- f$model$method
  mape <- accuracy(f$model)[5]
  mae <- accuracy(f$model)[3]  
  
  df2 <- as.data.frame(f$model)
  
  colnames(df2)[colnames(df2) == 'Lo 95'] <- 'lower'
  df2$lower[df2$lower < 0] <- 0
  colnames(df2)[colnames(df2) == 'Point Forecast'] <- 'avg'
  colnames(df2)[colnames(df2) == 'Hi 95'] <- 'upper'
  
  for (c in c('lower','avg','upper')) {
    df2[c] <- round(df2[c])
  }
  
  df2$part_id <- pn
  df2$forecast_month <- ds
  df2$model_used <- model_used
  df2$mape <- (round(mape,2))/100
  df2$mae <- round(mae)
  
  # join each product's dataframe with the rest of the products
  final_models <- rbind(final_models, df2)
}

end <- Sys.time()
end - start

## Time difference of 2.107939 mins

print(paste(length(unique(final_models$part_id)),
          'products,',
          round(total_models/length(unique(final_models$part_id))),
          'models per product on avg'))

## [1] "852 products, 26 models per product on avg"

Check it

Make sure I make a model for every part

v1 <- length(unique(final_models$part_id))
v2 <- length(all_pns)

if (v1 != v2){
  stop('mismatch')
}

Check out the MAPE and MAE over all models for all parts

What is MAPE and MAE: The MAPE is the “mean absolute percentage error” and the MAE is the “mean absolute error”. Both metrics are used to evaluate how good a forecast model is. A MAPE of, say, 30% means that each monthly forecast differs from the actual value by 30%. A MAE of, say, 100 means that each monthly forecast differs from the actual value by +/- 100 units.

Remarks on the MAPE: The MAPE has its drawbacks; it puts a heavier penalty on negative errors, as explained here: https://en.wikipedia.org/wiki/Mean_absolute_percentage_error#Issues, and if a forecast is too high, there will be no upper limit, which can confuse people.

For these reasons, I like to use the MAE. The MAE is also easier to explain and is more intuitive. At any rate, I programmatically created roughly 26 models per part, and had R choose the best one, so whatever I have here and now is as good as it’s going to get.

Summary of output: The median MAPE could be lower (it’s ~74%), but the median MAE is only ~25, meaning each model for each part for each forecast month is off by +/- 25 units, which is pretty good considering I created over 22,100 models!

summary(final_models[,c('mae','mape')])

##       mae               mape          
##  Min.   :   0.00   Min.   :   0.1163  
##  1st Qu.:  10.00   1st Qu.:   0.3679  
##  Median :  24.50   Median :   0.7420  
##  Mean   :  40.13   Mean   :  11.4908  
##  3rd Qu.:  48.00   3rd Qu.:   4.0768  
##  Max.   :1153.00   Max.   :1641.0091

Prep part data for join

I’ll merge the forecasts with part data so that I can create some Excel workbooks to share with the company.

ls <- colnames(main)
drop <- c('year_and_month',
          'qty_total',
          'sku_id',
          'sku_date_modified')
ls <- ls[!ls %in% drop]

part_data <- main[ls]
part_data <- unique(part_data)

Join part data to final models

final_models <- merge(final_models, part_data, by = 'part_id', all.x = TRUE)

Start building Excel workbook

Create one workbook per department (“dept A” and “dept B”), where each workbook has tabs that go like this:
* Tab01: Model Summary for each part (the model that was used along with the model accuracy) * Tab02 - 06: Forecasts per popularity tier per part * Tab07: Parts that are excluded, for reference.

boms <- unique(final_models$bom_type)
pops <- pops <- final_models$popularity_tier %>% 
        unique() %>% 
        mixedsort()
dates <- unique(final_models$forecast_month)

for (b in boms){
  
  if (b == 'dept A') {
    title <- paste('Dept A Forecasts for ',d1,' to ',d2, sep = '')
  } else {
    title <- paste('Dept B Forecasts for ',d1,' to ',d2, sep = '')
  }
  
#===========================================================================
# Tab01: Model Summaries
#===========================================================================
  
  cols <- c('part_id',
            'mape',
            'mae',
            'model_used',
            'bom_type',
            'popularity_tier',
            'outsourced')
  
  df <- unique(final_models[cols])
  df <- arrange(df, popularity_tier, part_id)
  df <- subset(df, bom_type == b)
  colnames(df) <- str_replace_all(colnames(df), '_', ' ')
  
  worksheet_ls <- list('Model Summaries' = df)
  
#===========================================================================
# Tab02 - 06: Forecasts per part per month, where each popularity tier gets its own worksheet
#===========================================================================
  
  cols <- c('part_id',
            'bom_type',
            'popularity_tier',
            'forecast_month',
            'lower',
            'avg',
            'upper',
            'outsourced')
  
  df <- final_models[cols]
  df <- subset(df, bom_type == b)
  df <- arrange(df, popularity_tier, part_id)
  
  df$forecast_month <- substr(df$forecast_month, 1, 7)

  # segment by popularity tier
  for (p in pops) {
    df2 <- subset(df, popularity_tier == p)
    colnames(df2) <- str_replace_all(colnames(df2), '_', ' ')
    
    worksheet_ls[[p]] <- df2
  }
  
#===========================================================================
# Tab07: Parts Excluded
#===========================================================================
  
  worksheet_ls[['Parts Excluded']] <- subset(excluded, bom_type == b)
  
#===========================================================================
# Make everything look nicer
#===========================================================================
  
  name <- names(worksheet_ls)
  i <- 1
  
  for (w in worksheet_ls) {
    colnames(w) <- str_replace_all(colnames(w), '_',' ')
    colnames(w) <- colnames(w) %>% toTitleCase()
    
    for (c in colnames(w)) {
      if (class(w[[c]]) == 'character') {
        w[[c]] <- w[[c]] %>% toTitleCase()
      }
    }
    
    worksheet_ls[[name[i]]] <- w
    i <- i + 1
    
  }  
  
#===========================================================================
# Write it all to excel
#===========================================================================
  
  if (excel_write == 'yes') {
    write.xlsx(worksheet_ls, file = paste(write_path, title,'.xlsx', sep = ''))  
  }
}

print('done')

## [1] "done"