R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# -----------------------------
# Packages installation & loading
# -----------------------------
packages <- c("xgboost", "Metrics", "ggplot2", "data.table",
              "lubridate", "DT", "dplyr")

for (pkg in packages) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg, repos = "https://cloud.r-project.org/")
    library(pkg, character.only = TRUE)
  }
}
## Loading required package: xgboost
## Loading required package: Metrics
## Loading required package: ggplot2
## Loading required package: data.table
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## Loading required package: DT
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# -----------------------------
# Read packages into Library
# -----------------------------
library(xgboost)
library(Metrics)
library(ggplot2)
library(data.table)
library(lubridate)
library(DT)
library(dplyr)

read dataset from GitHub

library(readxl)

url <- "https://raw.githubusercontent.com/obuczkipp/demand_forecast_sap_data/main/MZ-TG-Y120_EXPORT.xlsx"

temp <- tempfile(fileext = ".xlsx")

download.file(url, temp, mode = "wb")

MZ_TG_Y120_EXPORT <- read_excel(temp)
## New names:
## • `Item` -> `Item...2`
## • `Reference Document` -> `Reference Document...24`
## • `Reference Item` -> `Reference Item...25`
## • `Division` -> `Division...40`
## • `Statistical Value` -> `Statistical Value...56`
## • `Division` -> `Division...68`
## • `Material Group 1` -> `Material Group 1...93`
## • `Material Group 2` -> `Material Group 2...94`
## • `Credit Price` -> `Credit Price...105`
## • `Usage` -> `Usage...113`
## • `SD Document Category` -> `SD Document Category...127`
## • `Item` -> `Item...130`
## • `Credit Price` -> `Credit Price...139`
## • `Region` -> `Region...143`
## • `Tax Code` -> `Tax Code...150`
## • `Translation Date` -> `Translation Date...155`
## • `Material Group 1` -> `Material Group 1...156`
## • `Material Group 2` -> `Material Group 2...157`
## • `Usage` -> `Usage...163`
## • `Tax Code` -> `Tax Code...167`
## • `Reference Document` -> `Reference Document...175`
## • `Reference Item` -> `Reference Item...176`
## • `Translation Date` -> `Translation Date...191`
## • `Statistical Value` -> `Statistical Value...201`
## • `Tax Code` -> `Tax Code...205`
## • `SD Document Category` -> `SD Document Category...236`
## • `Region` -> `Region...243`
## • `Company Code` -> `Company Code...251`
## • `Opertn Task List No.` -> `Opertn Task List No....268`
## • `Counter` -> `Counter...269`
## • `Company Code` -> `Company Code...275`
## • `Item` -> `Item...277`
## • `Opertn Task List No.` -> `Opertn Task List No....300`
## • `Counter` -> `Counter...301`

read SAP data into R

# -----------------------------
# Prepare data
# -----------------------------


daily_full <- as.data.table(MZ_TG_Y120_EXPORT)

# Rename columns: instead of spaces, use underscores
colnames(daily_full) <- gsub(" ", "_", colnames(daily_full))

# Created_On Date format
daily_full[, Created_On := as.Date(Created_On)]
setorder(daily_full, Material, Created_On)

Structure of the dataset

str(daily_full)
## Classes 'data.table' and 'data.frame':   1599 obs. of  306 variables:
##  $ Billing_Document                        : chr  "90000000" "90000001" "90000002" "90000003" ...
##  $ Item...2                                : chr  "10" "10" "10" "10" ...
##  $ Higher-Level_Item                       : chr  "0" "0" "0" "0" ...
##  $ Invoiced_Quantity                       : num  131 134 57 21 227 186 5 39 40 30 ...
##  $ Sales_Unit                              : chr  "PC" "PC" "PC" "PC" ...
##  $ Numerator                               : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Denominator                             : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Base_Unit_of_Measure                    : chr  "PC" "PC" "PC" "PC" ...
##  $ Scale_Quantity                          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Billing_Qty_in_SKU                      : num  131 134 57 21 227 186 5 39 40 30 ...
##  $ Required_Quantity                       : num  131 134 57 21 227 186 5 39 40 30 ...
##  $ Net_Weight                              : num  117.9 120.6 51.3 18.9 204.3 ...
##  $ Gross_Weight                            : num  131 134 57 21 227 186 5 39 40 30 ...
##  $ Unit_of_Weight                          : chr  "G" "G" "G" "G" ...
##  $ Volume                                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Volume_Unit                             : logi  NA NA NA NA NA NA ...
##  $ Business_Area                           : logi  NA NA NA NA NA NA ...
##  $ Pricing_Date                            : POSIXct, format: "2017-10-08" "2017-10-08" ...
##  $ Serv._Rendered_Date                     : POSIXct, format: "2017-10-08" "2017-10-08" ...
##  $ Exchange_Rate                           : chr  "1,00000" "1,00000" "1,00000" "1,00000" ...
##  $ Net_Value                               : num  9170 9380 3990 1470 15890 ...
##  $ Originating_Document                    : logi  NA NA NA NA NA NA ...
##  $ Originating_Item                        : chr  "0" "0" "0" "0" ...
##  $ Reference_Document...24                 : chr  "80000000" "80000001" "80000002" "80000003" ...
##  $ Reference_Item...25                     : chr  "10" "10" "10" "10" ...
##  $ Preceding_Doc.Categ.                    : chr  "J" "J" "J" "J" ...
##  $ Sales_Document                          : chr  "22" "23" "24" "25" ...
##  $ Sales_Document_Item                     : chr  "10" "10" "10" "10" ...
##  $ Referenced_Sales_Document               : chr  NA NA NA NA ...
##  $ Material                                : chr  "MZ-TG-Y120" "MZ-TG-Y120" "MZ-TG-Y120" "MZ-TG-Y120" ...
##  $ Item_Description                        : chr  "Y120 Bike" "Y120 Bike" "Y120 Bike" "Y120 Bike" ...
##  $ Pricing_Ref._Matl                       : logi  NA NA NA NA NA NA ...
##  $ Batch                                   : logi  NA NA NA NA NA NA ...
##  $ Material_Group                          : chr  "ZYOUTH" "ZYOUTH" "ZYOUTH" "ZYOUTH" ...
##  $ Item_Category                           : chr  "TAN" "TAN" "TAN" "TAN" ...
##  $ Item_Type                               : logi  NA NA NA NA NA NA ...
##  $ Product_Hierarchy                       : logi  NA NA NA NA NA NA ...
##  $ Shipping_Point/Receiving_Pt             : chr  "1710" "1710" "1710" "1710" ...
##  $ Replacement_Part                        : logi  NA NA NA NA NA NA ...
##  $ Division...40                           : chr  "00" "00" "00" "00" ...
##  $ Partner_Item                            : chr  "10" "10" "10" "10" ...
##  $ Plant                                   : chr  "1710" "1710" "1710" "1710" ...
##  $ Departure_Ctry/Reg.                     : chr  "US" "US" "US" "US" ...
##  $ Region_of_Dlv._Plant                    : chr  "CA" "CA" "CA" "CA" ...
##  $ County_of_Dlv.Plant                     : logi  NA NA NA NA NA NA ...
##  $ City_of_Deliv._Plant                    : logi  NA NA NA NA NA NA ...
##  $ Tax_Class._Material                     : chr  "0" "0" "0" "0" ...
##  $ Tax_Class.2_Material                    : logi  NA NA NA NA NA NA ...
##  $ Tax_Class.3_Material                    : logi  NA NA NA NA NA NA ...
##  $ Tax_Class.4_Material                    : logi  NA NA NA NA NA NA ...
##  $ Tax_Class.5_Material                    : logi  NA NA NA NA NA NA ...
##  $ Tax_Class.6_Material                    : logi  NA NA NA NA NA NA ...
##  $ Tax_Class.7_Material                    : logi  NA NA NA NA NA NA ...
##  $ Tax_Class.8_Material                    : logi  NA NA NA NA NA NA ...
##  $ Tax_Class.9_Material                    : logi  NA NA NA NA NA NA ...
##  $ Statistical_Value...56                  : logi  NA NA NA NA NA NA ...
##  $ Pricing_Relevance                       : chr  "X" "X" "X" "X" ...
##  $ Cash_Discount                           : chr  "X" "X" "X" "X" ...
##  $ Cash_Disc._Bas.                         : num  9170 9380 3990 1470 15890 ...
##  $ Material_Price_Grp                      : logi  NA NA NA NA NA NA ...
##  $ Acct_Assmt_Grp_Mat.                     : chr  "01" "01" "01" "01" ...
##  $ Cost_Center                             : logi  NA NA NA NA NA NA ...
##  $ Volume_Rebate_Group                     : logi  NA NA NA NA NA NA ...
##  $ Commission_Group                        : logi  NA NA NA NA NA NA ...
##  $ EAN_Number                              : logi  NA NA NA NA NA NA ...
##  $ Sales_Group                             : logi  NA NA NA NA NA NA ...
##  $ Sales_Office                            : logi  NA NA NA NA NA NA ...
##  $ Division...68                           : chr  "00" "00" "00" "00" ...
##  $ Returns                                 : logi  NA NA NA NA NA NA ...
##  $ Created_By                              : chr  "BPINST" "BPINST" "BPINST" "BPINST" ...
##  $ Created_On                              : Date, format: "2017-10-09" "2017-10-09" ...
##  $ Time                                    : POSIXct, format: "1899-12-31 00:26:52" "1899-12-31 00:27:06" ...
##  $ Valuation_Type                          : logi  NA NA NA NA NA NA ...
##  $ Storage_Location                        : chr  "171A" "171A" "171A" "171A" ...
##  $ Update_Group_(stats)                    : logi  NA NA NA NA NA NA ...
##  $ Cost                                    : num  5353 5475 2329 858 9275 ...
##  $ Subtotal_1                              : num  9170 9380 3990 1470 15890 ...
##  $ Subtotal_2                              : num  9170 9380 3990 1470 15890 ...
##  $ Subtotal_3                              : num  9170 9380 3990 1470 15890 ...
##  $ Subtotal_4                              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Subtotal_5                              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Subtotal_6                              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Exchange_Rate_Stats.                    : chr  "1,00000" "1,00000" "1,00000" "1,00000" ...
##  $ Pricing                                 : logi  NA NA NA NA NA NA ...
##  $ General                                 : logi  NA NA NA NA NA NA ...
##  $ EAN/UPC                                 : logi  NA NA NA NA NA NA ...
##  $ Profit_Center                           : chr  "US10_TGY1" "US10_TGY1" "US10_TGY1" "US10_TGY1" ...
##  $ Customer_Group_1                        : logi  NA NA NA NA NA NA ...
##  $ Customer_Group_2                        : logi  NA NA NA NA NA NA ...
##  $ Customer_Group_3                        : logi  NA NA NA NA NA NA ...
##  $ Customer_Group_4                        : logi  NA NA NA NA NA NA ...
##  $ Customer_Group_5                        : logi  NA NA NA NA NA NA ...
##  $ Material_Group_1...93                   : logi  NA NA NA NA NA NA ...
##  $ Material_Group_2...94                   : logi  NA NA NA NA NA NA ...
##  $ Material_Group_3                        : logi  NA NA NA NA NA NA ...
##  $ Material_Group_4                        : logi  NA NA NA NA NA NA ...
##  $ Material_Group_5                        : logi  NA NA NA NA NA NA ...
##  $ Material_Entered                        : chr  "MZ-TG-Y120" "MZ-TG-Y120" "MZ-TG-Y120" "MZ-TG-Y120" ...
##  $ Rebate_Basis                            : num  0 0 0 0 0 0 0 0 0 0 ...
##   [list output truncated]
##  - attr(*, ".internal.selfref")=<externalptr>

Daily Rows for each Material (fill missing dates)

—————————–

all_dates <- daily_full[, .(Date = seq(min(Created_On), max(Created_On), by="day")), by = Material]
daily_full <- merge(all_dates, daily_full, by.x = c("Material", "Date"), by.y = c("Material", "Created_On"), all.x = TRUE)

—————————–

Missing Sales Qty fullfill with Material-wise mean

—————————–

daily_full[, Invoiced_Quantity := ifelse(is.na(Invoiced_Quantity),
                                         mean(Invoiced_Quantity, na.rm = TRUE),
                                         Invoiced_Quantity),
           by = Material]

—————————–

Handling Outliers: trim outliers (IQR method)

—————————–

daily_full[, `:=` (
  Q1 = quantile(Invoiced_Quantity, 0.25, na.rm = TRUE),
  Q3 = quantile(Invoiced_Quantity, 0.75, na.rm = TRUE)
), by = Material]

daily_full[, IQR := Q3 - Q1]
daily_full[, Invoiced_Quantity := pmin(pmax(Invoiced_Quantity, Q1 - 1.5*IQR), Q3 + 1.5*IQR)]
daily_full[, c("Q1","Q3","IQR") := NULL]  # delete temp columns

—————————–

Lags and frollmean 40 days

—————————–

setorder(daily_full, Material, Date)
lag_days <- 40  # 40 days back
daily_full[, paste0("lag_", 1:lag_days) := shift(Invoiced_Quantity, 1:lag_days), by = Material]
daily_full[, ma_lag := frollmean(Invoiced_Quantity, lag_days, na.rm = TRUE), by = Material]

—————————–

Peapre training data

—————————–

lag_cols <- paste0("lag_", 1:lag_days)
train_data <- daily_full[complete.cases(daily_full[, c(lag_cols, "ma_lag"), with=FALSE])]

X_train <- as.matrix(train_data[, c(lag_cols, "ma_lag"), with=FALSE])
y_train <- train_data$Invoiced_Quantity
dtrain <- xgb.DMatrix(data = X_train, label = y_train)

—————————–

Fitting XGBoost model

—————————–

params <- list(
  objective = "reg:squarederror", # regression task
  eval_metric = "rmse",
  max_depth = 7, # tree depth
  eta = 0.05, # learning rate
  subsample = 0.8, # row sampling
  colsample_bytree = 0.8 # column sampling
)

model <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 500, # number of boosting rounds
  verbose = 1 # print progress every 10 rounds
)

—————————–

Prediction for each Material and Date (using lag features)

—————————–

X_all <- as.matrix(daily_full[, c(lag_cols, "ma_lag"), with=FALSE])
daily_full[, pred := predict(model, xgb.DMatrix(X_all))] # predict for all rows (NA for those without lag features)

—————————–

Root Mean Squared Error shows the accuracy of the model, predicted values are close to actual values, the RMSE will be low. A # lower RMSE indicates a better fit of the model to the data.

—————————–

rmse_score <- rmse(daily_full$Invoiced_Quantity, daily_full$pred)
cat("RMSE (deep model, outlier-trimmed):", rmse_score, "\n")
## RMSE (deep model, outlier-trimmed): 7.978371

—————————–

Plotting actual vs. predicted demand for the Material

—————————–

ggplot(daily_full, aes(x=Date)) +
  geom_line(aes(y=Invoiced_Quantity, color="Actual")) +
  geom_line(aes(y=pred, color="Predicted")) +
  facet_wrap(~Material, scales="free_y") +
  labs(title="Daily Demand: Actual vs Predicted (Deep XGBoost, Outlier Trimmed)",
       y="Quantity",
       color="Legend") +
  theme_minimal()

—————————–

Create DT dashboard

—————————–

# daily_full table (Material, Date, Invoiced_Quantity, pred)
dt_data <- daily_full[, .(Material, Date, Invoiced_Quantity, pred)]

# Interaktiv table 
datatable(
  dt_data,
  options = list(
    pageLength = 100
    ,      # number of rows per page
    autoWidth = TRUE,     # weight columns based on content
    scrollX = TRUE        # scroll horizontally if table is wider than container
  ),
  rownames = FALSE,
  filter = "top"           # search box for each column
)