# knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(mlbench)
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom 1.0.6 ✔ rsample 1.2.1
## ✔ dials 1.2.1 ✔ tibble 3.2.1
## ✔ infer 1.0.7 ✔ tune 1.2.1
## ✔ modeldata 1.4.0 ✔ workflows 1.1.4
## ✔ parsnip 1.2.1 ✔ workflowsets 1.1.0
## ✔ purrr 1.0.2 ✔ yardstick 1.3.1
## ✔ recipes 1.0.10
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(DALEX)
## Welcome to DALEX (version: 2.4.3).
## Find examples and detailed introduction at: http://ema.drwhy.ai/
##
## Attaching package: 'DALEX'
## The following object is masked from 'package:dplyr':
##
## explain
library(ranger)
library(Rcpp)
##
## Attaching package: 'Rcpp'
## The following object is masked from 'package:rsample':
##
## populate
library(corrplot)
## corrplot 0.92 loaded
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(SHAPforxgboost)
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
lst_csv = list.files(pattern = '\\.csv$')
# Read every csv file in the working directory into a data frame named after
# the file (train.csv -> train, etc.); the dot is escaped so the pattern
# matches the literal extension rather than any character.
for (i in lst_csv){
  temp_df = read.csv(i)
  assign(gsub('\\.csv$', '', i), temp_df)
}
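# Hedged sanity check: the steps below assume the directory supplied
# train.csv, test.csv and solution_example.csv, i.e. that these three frames
# now exist.
stopifnot(exists("train"), exists("test"), exists("solution_example"))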
# Build the modeling frames: derive the day of week, encode date and warehouse
# as numeric indices, and keep the columns shared with the test set plus the
# target. Note that `weekdays` is not among colnames(test), so this selection
# drops it from train_df again. (The factor coding of warehouse matches across
# frames only if both contain the same set of warehouses.)
train_df = train %>%
  mutate(date = as.Date(date)) %>%
  mutate(weekdays = wday(date)) %>%
  mutate(date = as.numeric(factor(date))) %>%
  select(all_of(setdiff(colnames(test), c('id', 'holiday_name'))), orders) %>%
  mutate(warehouse = as.numeric(factor(warehouse)))
test_df = test %>%
  mutate(date = as.Date(date)) %>%
  mutate(weekdays = wday(date)) %>%
  mutate(date = as.numeric(factor(date))) %>%
  select(-holiday_name) %>%
  mutate(warehouse = as.numeric(factor(warehouse)))
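# Quick inspection (hedged addition): columns differing between the two
# frames. `weekdays` appears only in test_df and is excluded again before
# prediction below; everything else should match.
setdiff(colnames(test_df), c(colnames(train_df), 'id'))
setdiff(colnames(train_df), c(colnames(test_df), 'orders'))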
# Chronological 80/20 split: order by the numeric date index so the validation
# rows come after the training rows in time. Note this reuses the names
# `train` and `test`, overwriting the raw csv frames.
n = floor(nrow(train_df) * 0.8)
train_df <- train_df[order(train_df$date), ]
train = train_df[1:n, ]
test = train_df[(n + 1):nrow(train_df), ]
# Locate the target column by name instead of hard-coding position 7.
label_col <- which(colnames(train_df) == 'orders')
dtrain <- xgb.DMatrix(data = as.matrix(train[, -label_col]), label = train[, label_col])
dtest <- xgb.DMatrix(data = as.matrix(test[, -label_col]), label = test[, label_col])
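# Hedged sanity check: confirm the split is chronological under the numeric
# date index (no validation date precedes a training date).
stopifnot(max(train$date) <= min(test$date))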
# Custom MAPE eval function for xgboost; pmax(1, abs(labels)) guards the
# denominator against (near-)zero labels.
mape_eval <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- mean(abs((labels - preds) / pmax(1, abs(labels))))
  list(metric = "mape", value = err)
}
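# Quick check of the metric on toy values (hedged illustration, not part of
# the original analysis): labels 100 and 200 with 10% errors each should give
# a MAPE of exactly 0.10.
toy <- xgb.DMatrix(data = matrix(0, nrow = 2, ncol = 1), label = c(100, 200))
mape_eval(c(110, 180), toy)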
params <- list(
  booster = "gbtree",
  eta = 0.5,                       # high learning rate (xgboost default is 0.3)
  max_depth = 10,
  subsample = 0.7,                 # row sampling per tree
  colsample_bytree = 0.7,          # feature sampling per tree
  objective = "reg:squarederror"
)
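# Hedged aside (a sketch, left unevaluated): nrounds could instead be chosen
# by cross-validation with the same custom metric via xgb.cv. Note that random
# k-fold CV leaks future information on time-ordered data, so the time-based
# holdout used above is often the safer choice here.
# cv <- xgb.cv(params = params, data = dtrain, nrounds = 200, nfold = 5,
#              feval = mape_eval, maximize = FALSE,
#              early_stopping_rounds = 20, verbose = 0)
# cv$best_iteration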
num_round <- 100
# The watchlist contains only the training set, so early stopping monitors
# training MAPE (it guards against stalls, not overfitting); adding the
# validation set to the watchlist would track generalization instead.
watchlist <- list(train = dtrain)
model <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = num_round,
  watchlist = watchlist,
  eval_metric = mape_eval,
  maximize = FALSE,
  early_stopping_rounds = 50
)
## [1] train-mape:0.495361
## Will train until train_mape hasn't improved in 50 rounds.
##
## [2] train-mape:0.267383
## [3] train-mape:0.140297
## [4] train-mape:0.082433
## [5] train-mape:0.062820
## [6] train-mape:0.059089
## [7] train-mape:0.054445
## [8] train-mape:0.055547
## [9] train-mape:0.054940
## [10] train-mape:0.052501
## [11] train-mape:0.051276
## [12] train-mape:0.049128
## [13] train-mape:0.045392
## [14] train-mape:0.045255
## [15] train-mape:0.044535
## [16] train-mape:0.043740
## [17] train-mape:0.043592
## [18] train-mape:0.043499
## [19] train-mape:0.042452
## [20] train-mape:0.041294
## [21] train-mape:0.040657
## [22] train-mape:0.040778
## [23] train-mape:0.040671
## [24] train-mape:0.039878
## [25] train-mape:0.039961
## [26] train-mape:0.039687
## [27] train-mape:0.039632
## [28] train-mape:0.039209
## [29] train-mape:0.038819
## [30] train-mape:0.038874
## [31] train-mape:0.038663
## [32] train-mape:0.038696
## [33] train-mape:0.038012
## [34] train-mape:0.037129
## [35] train-mape:0.036589
## [36] train-mape:0.035790
## [37] train-mape:0.035764
## [38] train-mape:0.035021
## [39] train-mape:0.034860
## [40] train-mape:0.034735
## [41] train-mape:0.034370
## [42] train-mape:0.033570
## [43] train-mape:0.033557
## [44] train-mape:0.033234
## [45] train-mape:0.033373
## [46] train-mape:0.033362
## [47] train-mape:0.032567
## [48] train-mape:0.032420
## [49] train-mape:0.032493
## [50] train-mape:0.032514
## [51] train-mape:0.032531
## [52] train-mape:0.032436
## [53] train-mape:0.032072
## [54] train-mape:0.032088
## [55] train-mape:0.032097
## [56] train-mape:0.031280
## [57] train-mape:0.031304
## [58] train-mape:0.031302
## [59] train-mape:0.031330
## [60] train-mape:0.031363
## [61] train-mape:0.031366
## [62] train-mape:0.030567
## [63] train-mape:0.030498
## [64] train-mape:0.030050
## [65] train-mape:0.030095
## [66] train-mape:0.029600
## [67] train-mape:0.028712
## [68] train-mape:0.028711
## [69] train-mape:0.027937
## [70] train-mape:0.027920
## [71] train-mape:0.027888
## [72] train-mape:0.027926
## [73] train-mape:0.027899
## [74] train-mape:0.027859
## [75] train-mape:0.027854
## [76] train-mape:0.027939
## [77] train-mape:0.027958
## [78] train-mape:0.027328
## [79] train-mape:0.027317
## [80] train-mape:0.027261
## [81] train-mape:0.027290
## [82] train-mape:0.027278
## [83] train-mape:0.026377
## [84] train-mape:0.026380
## [85] train-mape:0.025927
## [86] train-mape:0.025650
## [87] train-mape:0.025027
## [88] train-mape:0.025043
## [89] train-mape:0.024665
## [90] train-mape:0.024239
## [91] train-mape:0.024240
## [92] train-mape:0.024301
## [93] train-mape:0.024357
## [94] train-mape:0.023583
## [95] train-mape:0.023630
## [96] train-mape:0.023359
## [97] train-mape:0.022833
## [98] train-mape:0.022824
## [99] train-mape:0.022809
## [100] train-mape:0.022879
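# Hedged addition (a sketch): xgb.importance summarizes which features the
# fitted trees rely on most.
xgb.importance(model = model)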
# Predictions on the held-out split (no id column exists here, since id was
# dropped when train_df was built).
prediction = as.data.frame(predict(model, dtest))
prediction = prediction %>%
  mutate(warehouse = test$warehouse) %>%
  mutate(date = test$date) %>%
  rename(orders = `predict(model, dtest)`)
ggplot(prediction, aes(y = orders, x = date, color = as.factor(warehouse))) +
  geom_line() +
  labs(title = "Prediction on the held-out validation split",
       x = "Date index",
       y = "Predicted orders",
       color = "Warehouse") +
  theme_minimal()
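# Hedged addition: score the held-out split with the same clipped MAPE used
# during training, to gauge generalization beyond the training log above.
val_mape <- mean(abs((test$orders - prediction$orders) / pmax(1, abs(test$orders))))
val_mape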

# Score the competition test set. Note the model used here is the one fit on
# the 80% slice above; it is not refit on the full training data.
test = test_df %>%
  select(-id)
# Build the feature matrix by name so the columns match the training features
# in both content and order, regardless of column position.
feat_cols <- setdiff(colnames(train_df), 'orders')
dtest <- xgb.DMatrix(data = as.matrix(test[, feat_cols]))
prediction = as.data.frame(predict(model, dtest))
prediction = prediction %>%
  mutate(id = test_df$id) %>%
  mutate(warehouse = test$warehouse) %>%
  mutate(date = test$date) %>%
  rename(orders = `predict(model, dtest)`)
ggplot(prediction, aes(y = orders, x = date, color = as.factor(warehouse))) +
  geom_line() +
  labs(title = "Prediction on the competition test set",
       x = "Date index",
       y = "Predicted orders",
       color = "Warehouse") +
  theme_minimal()

# Keep only id and the predicted orders, aligned to the sample solution ids;
# rename the merge suffix so the submission carries the expected column name.
prediction = merge(solution_example, prediction, by = "id") %>%
  select(-c(orders.x, date, warehouse)) %>%
  rename(orders = orders.y)
write.csv(prediction, 'prediction_12_xgboost_MAPE.csv', row.names = FALSE)
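# Hedged final check: the submission should cover exactly the sample ids and
# the two expected columns.
stopifnot(setequal(prediction$id, solution_example$id),
          identical(colnames(prediction), c("id", "orders")))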