This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# -----------------------------
# Packages installation & loading
# -----------------------------
packages <- c("xgboost", "Metrics", "ggplot2", "data.table",
"lubridate", "DT", "dplyr")
for (pkg in packages) {
if (!require(pkg, character.only = TRUE)) {
install.packages(pkg, repos = "https://cloud.r-project.org/")
library(pkg, character.only = TRUE)
}
}
## Loading required package: xgboost
## Loading required package: Metrics
## Loading required package: ggplot2
## Loading required package: data.table
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## Loading required package: DT
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# -----------------------------
# Read packages into Library
# -----------------------------
library(xgboost)
library(Metrics)
library(ggplot2)
library(data.table)
library(lubridate)
library(DT)
library(dplyr)
library(readxl)
url <- "https://raw.githubusercontent.com/obuczkipp/demand_forecast_sap_data/main/MZ-TG-Y120_EXPORT.xlsx"
temp <- tempfile(fileext = ".xlsx")
download.file(url, temp, mode = "wb")
MZ_TG_Y120_EXPORT <- read_excel(temp)
## New names:
## • `Item` -> `Item...2`
## • `Reference Document` -> `Reference Document...24`
## • `Reference Item` -> `Reference Item...25`
## • `Division` -> `Division...40`
## • `Statistical Value` -> `Statistical Value...56`
## • `Division` -> `Division...68`
## • `Material Group 1` -> `Material Group 1...93`
## • `Material Group 2` -> `Material Group 2...94`
## • `Credit Price` -> `Credit Price...105`
## • `Usage` -> `Usage...113`
## • `SD Document Category` -> `SD Document Category...127`
## • `Item` -> `Item...130`
## • `Credit Price` -> `Credit Price...139`
## • `Region` -> `Region...143`
## • `Tax Code` -> `Tax Code...150`
## • `Translation Date` -> `Translation Date...155`
## • `Material Group 1` -> `Material Group 1...156`
## • `Material Group 2` -> `Material Group 2...157`
## • `Usage` -> `Usage...163`
## • `Tax Code` -> `Tax Code...167`
## • `Reference Document` -> `Reference Document...175`
## • `Reference Item` -> `Reference Item...176`
## • `Translation Date` -> `Translation Date...191`
## • `Statistical Value` -> `Statistical Value...201`
## • `Tax Code` -> `Tax Code...205`
## • `SD Document Category` -> `SD Document Category...236`
## • `Region` -> `Region...243`
## • `Company Code` -> `Company Code...251`
## • `Opertn Task List No.` -> `Opertn Task List No....268`
## • `Counter` -> `Counter...269`
## • `Company Code` -> `Company Code...275`
## • `Item` -> `Item...277`
## • `Opertn Task List No.` -> `Opertn Task List No....300`
## • `Counter` -> `Counter...301`
# -----------------------------
# Prepare data
# -----------------------------
daily_full <- as.data.table(MZ_TG_Y120_EXPORT)
# Rename columns: instead of spaces, use underscores
colnames(daily_full) <- gsub(" ", "_", colnames(daily_full))
# Created_On Date format
daily_full[, Created_On := as.Date(Created_On)]
setorder(daily_full, Material, Created_On)
str(daily_full)
## Classes 'data.table' and 'data.frame': 1599 obs. of 306 variables:
## $ Billing_Document : chr "90000000" "90000001" "90000002" "90000003" ...
## $ Item...2 : chr "10" "10" "10" "10" ...
## $ Higher-Level_Item : chr "0" "0" "0" "0" ...
## $ Invoiced_Quantity : num 131 134 57 21 227 186 5 39 40 30 ...
## $ Sales_Unit : chr "PC" "PC" "PC" "PC" ...
## $ Numerator : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Denominator : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Base_Unit_of_Measure : chr "PC" "PC" "PC" "PC" ...
## $ Scale_Quantity : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Billing_Qty_in_SKU : num 131 134 57 21 227 186 5 39 40 30 ...
## $ Required_Quantity : num 131 134 57 21 227 186 5 39 40 30 ...
## $ Net_Weight : num 117.9 120.6 51.3 18.9 204.3 ...
## $ Gross_Weight : num 131 134 57 21 227 186 5 39 40 30 ...
## $ Unit_of_Weight : chr "G" "G" "G" "G" ...
## $ Volume : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Volume_Unit : logi NA NA NA NA NA NA ...
## $ Business_Area : logi NA NA NA NA NA NA ...
## $ Pricing_Date : POSIXct, format: "2017-10-08" "2017-10-08" ...
## $ Serv._Rendered_Date : POSIXct, format: "2017-10-08" "2017-10-08" ...
## $ Exchange_Rate : chr "1,00000" "1,00000" "1,00000" "1,00000" ...
## $ Net_Value : num 9170 9380 3990 1470 15890 ...
## $ Originating_Document : logi NA NA NA NA NA NA ...
## $ Originating_Item : chr "0" "0" "0" "0" ...
## $ Reference_Document...24 : chr "80000000" "80000001" "80000002" "80000003" ...
## $ Reference_Item...25 : chr "10" "10" "10" "10" ...
## $ Preceding_Doc.Categ. : chr "J" "J" "J" "J" ...
## $ Sales_Document : chr "22" "23" "24" "25" ...
## $ Sales_Document_Item : chr "10" "10" "10" "10" ...
## $ Referenced_Sales_Document : chr NA NA NA NA ...
## $ Material : chr "MZ-TG-Y120" "MZ-TG-Y120" "MZ-TG-Y120" "MZ-TG-Y120" ...
## $ Item_Description : chr "Y120 Bike" "Y120 Bike" "Y120 Bike" "Y120 Bike" ...
## $ Pricing_Ref._Matl : logi NA NA NA NA NA NA ...
## $ Batch : logi NA NA NA NA NA NA ...
## $ Material_Group : chr "ZYOUTH" "ZYOUTH" "ZYOUTH" "ZYOUTH" ...
## $ Item_Category : chr "TAN" "TAN" "TAN" "TAN" ...
## $ Item_Type : logi NA NA NA NA NA NA ...
## $ Product_Hierarchy : logi NA NA NA NA NA NA ...
## $ Shipping_Point/Receiving_Pt : chr "1710" "1710" "1710" "1710" ...
## $ Replacement_Part : logi NA NA NA NA NA NA ...
## $ Division...40 : chr "00" "00" "00" "00" ...
## $ Partner_Item : chr "10" "10" "10" "10" ...
## $ Plant : chr "1710" "1710" "1710" "1710" ...
## $ Departure_Ctry/Reg. : chr "US" "US" "US" "US" ...
## $ Region_of_Dlv._Plant : chr "CA" "CA" "CA" "CA" ...
## $ County_of_Dlv.Plant : logi NA NA NA NA NA NA ...
## $ City_of_Deliv._Plant : logi NA NA NA NA NA NA ...
## $ Tax_Class._Material : chr "0" "0" "0" "0" ...
## $ Tax_Class.2_Material : logi NA NA NA NA NA NA ...
## $ Tax_Class.3_Material : logi NA NA NA NA NA NA ...
## $ Tax_Class.4_Material : logi NA NA NA NA NA NA ...
## $ Tax_Class.5_Material : logi NA NA NA NA NA NA ...
## $ Tax_Class.6_Material : logi NA NA NA NA NA NA ...
## $ Tax_Class.7_Material : logi NA NA NA NA NA NA ...
## $ Tax_Class.8_Material : logi NA NA NA NA NA NA ...
## $ Tax_Class.9_Material : logi NA NA NA NA NA NA ...
## $ Statistical_Value...56 : logi NA NA NA NA NA NA ...
## $ Pricing_Relevance : chr "X" "X" "X" "X" ...
## $ Cash_Discount : chr "X" "X" "X" "X" ...
## $ Cash_Disc._Bas. : num 9170 9380 3990 1470 15890 ...
## $ Material_Price_Grp : logi NA NA NA NA NA NA ...
## $ Acct_Assmt_Grp_Mat. : chr "01" "01" "01" "01" ...
## $ Cost_Center : logi NA NA NA NA NA NA ...
## $ Volume_Rebate_Group : logi NA NA NA NA NA NA ...
## $ Commission_Group : logi NA NA NA NA NA NA ...
## $ EAN_Number : logi NA NA NA NA NA NA ...
## $ Sales_Group : logi NA NA NA NA NA NA ...
## $ Sales_Office : logi NA NA NA NA NA NA ...
## $ Division...68 : chr "00" "00" "00" "00" ...
## $ Returns : logi NA NA NA NA NA NA ...
## $ Created_By : chr "BPINST" "BPINST" "BPINST" "BPINST" ...
## $ Created_On : Date, format: "2017-10-09" "2017-10-09" ...
## $ Time : POSIXct, format: "1899-12-31 00:26:52" "1899-12-31 00:27:06" ...
## $ Valuation_Type : logi NA NA NA NA NA NA ...
## $ Storage_Location : chr "171A" "171A" "171A" "171A" ...
## $ Update_Group_(stats) : logi NA NA NA NA NA NA ...
## $ Cost : num 5353 5475 2329 858 9275 ...
## $ Subtotal_1 : num 9170 9380 3990 1470 15890 ...
## $ Subtotal_2 : num 9170 9380 3990 1470 15890 ...
## $ Subtotal_3 : num 9170 9380 3990 1470 15890 ...
## $ Subtotal_4 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Subtotal_5 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Subtotal_6 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Exchange_Rate_Stats. : chr "1,00000" "1,00000" "1,00000" "1,00000" ...
## $ Pricing : logi NA NA NA NA NA NA ...
## $ General : logi NA NA NA NA NA NA ...
## $ EAN/UPC : logi NA NA NA NA NA NA ...
## $ Profit_Center : chr "US10_TGY1" "US10_TGY1" "US10_TGY1" "US10_TGY1" ...
## $ Customer_Group_1 : logi NA NA NA NA NA NA ...
## $ Customer_Group_2 : logi NA NA NA NA NA NA ...
## $ Customer_Group_3 : logi NA NA NA NA NA NA ...
## $ Customer_Group_4 : logi NA NA NA NA NA NA ...
## $ Customer_Group_5 : logi NA NA NA NA NA NA ...
## $ Material_Group_1...93 : logi NA NA NA NA NA NA ...
## $ Material_Group_2...94 : logi NA NA NA NA NA NA ...
## $ Material_Group_3 : logi NA NA NA NA NA NA ...
## $ Material_Group_4 : logi NA NA NA NA NA NA ...
## $ Material_Group_5 : logi NA NA NA NA NA NA ...
## $ Material_Entered : chr "MZ-TG-Y120" "MZ-TG-Y120" "MZ-TG-Y120" "MZ-TG-Y120" ...
## $ Rebate_Basis : num 0 0 0 0 0 0 0 0 0 0 ...
## [list output truncated]
## - attr(*, ".internal.selfref")=<externalptr>
all_dates <- daily_full[, .(Date = seq(min(Created_On), max(Created_On), by="day")), by = Material]
daily_full <- merge(all_dates, daily_full, by.x = c("Material", "Date"), by.y = c("Material", "Created_On"), all.x = TRUE)
daily_full[, Invoiced_Quantity := ifelse(is.na(Invoiced_Quantity),
mean(Invoiced_Quantity, na.rm = TRUE),
Invoiced_Quantity),
by = Material]
daily_full[, `:=` (
Q1 = quantile(Invoiced_Quantity, 0.25, na.rm = TRUE),
Q3 = quantile(Invoiced_Quantity, 0.75, na.rm = TRUE)
), by = Material]
daily_full[, IQR := Q3 - Q1]
daily_full[, Invoiced_Quantity := pmin(pmax(Invoiced_Quantity, Q1 - 1.5*IQR), Q3 + 1.5*IQR)]
daily_full[, c("Q1","Q3","IQR") := NULL] # delete temp columns
setorder(daily_full, Material, Date)
lag_days <- 40 # 40 days back
daily_full[, paste0("lag_", 1:lag_days) := shift(Invoiced_Quantity, 1:lag_days), by = Material]
daily_full[, ma_lag := frollmean(Invoiced_Quantity, lag_days, na.rm = TRUE), by = Material]
lag_cols <- paste0("lag_", 1:lag_days)
train_data <- daily_full[complete.cases(daily_full[, c(lag_cols, "ma_lag"), with=FALSE])]
X_train <- as.matrix(train_data[, c(lag_cols, "ma_lag"), with=FALSE])
y_train <- train_data$Invoiced_Quantity
dtrain <- xgb.DMatrix(data = X_train, label = y_train)
params <- list(
objective = "reg:squarederror", # regression task
eval_metric = "rmse",
max_depth = 7, # tree depth
eta = 0.05, # learning rate
subsample = 0.8, # row sampling
colsample_bytree = 0.8 # column sampling
)
model <- xgb.train(
params = params,
data = dtrain,
nrounds = 500, # number of boosting rounds
verbose = 1 # print progress every 10 rounds
)
X_all <- as.matrix(daily_full[, c(lag_cols, "ma_lag"), with=FALSE])
daily_full[, pred := predict(model, xgb.DMatrix(X_all))] # predict for all rows (NA for those without lag features)
rmse_score <- rmse(daily_full$Invoiced_Quantity, daily_full$pred)
cat("RMSE (deep model, outlier-trimmed):", rmse_score, "\n")
## RMSE (deep model, outlier-trimmed): 7.978371
ggplot(daily_full, aes(x=Date)) +
geom_line(aes(y=Invoiced_Quantity, color="Actual")) +
geom_line(aes(y=pred, color="Predicted")) +
facet_wrap(~Material, scales="free_y") +
labs(title="Daily Demand: Actual vs Predicted (Deep XGBoost, Outlier Trimmed)",
y="Quantity",
color="Legend") +
theme_minimal()
# daily_full table (Material, Date, Invoiced_Quantity, pred)
dt_data <- daily_full[, .(Material, Date, Invoiced_Quantity, pred)]
# Interaktiv table
datatable(
dt_data,
options = list(
pageLength = 100
, # number of rows per page
autoWidth = TRUE, # weight columns based on content
scrollX = TRUE # scroll horizontally if table is wider than container
),
rownames = FALSE,
filter = "top" # search box for each column
)