# =====================
# LIBRARIES
# =====================
library(dplyr)
library(readr)
library(zoo)
library(lightgbm)
library(xgboost)
library(ranger)
library(data.table)
library(ggplot2)
library(plotly)
library(scales)
library(corrplot)
library(tidyverse)
library(ggpubr)
library(tidyr)
library(tictoc)Laporan Tugas Kaggle Food Demand Forecasting
Latar Belakang
Perusahaan pengiriman makanan yang beroperasi di beberapa kota telah mendirikan berbagai pusat pemenuhan (fulfilment center ) untuk mengatur pengiriman pesanan makanan kepada pelanggan. Mengingat bahan baku yang digunakan mayoritas adalah bahan yang mudah busuk, pengadaan dan perencanaan stok mingguan menjadi sangat kritis. Selain itu, perencanaan tenaga kerja di pusat-pusat ini juga membutuhkan ramalan permintaan yang akurat untuk efisiensi operasional.
Pengisian ulang untuk sebagian besar bahan baku dilakukan setiap minggu. Tujuan dari proyek ini adalah untuk mengembangkan model yang dapat memprediksi permintaan untuk kombinasi pusat- makanan untuk 10 minggu ke depan (Minggu: 136-145-Test Dataset). Ramalan yang akurat membuat pusat-pusat pemenuhan dapat merencanakan pengadaan bahan baku dan penjadwalan staf dengan lebih efektif, sehingga mengurangi pemborosan dan meningkatkan kepuasan pelanggan.
# =====================
# LOAD DATA
# =====================
All_data <- read_csv("All_data.csv", show_col_types = FALSE)
new_data <- read_csv("data_baru.csv", show_col_types = FALSE)
center_info <- read_csv("fulfilment_center_info.csv", show_col_types = FALSE)
meal_info <- read_csv("meal_info.csv", show_col_types = FALSE)
# =====================
# JOIN DATA
# =====================
df <- All_data %>%
left_join(center_info, by = c("center_id" = "id")) %>%
left_join(meal_info, by = "meal_id")
base_df <- df
history_data <- base_df
future_raw <- new_data %>%
left_join(center_info, by = c("center_id"="id")) %>%
left_join(meal_info, by="meal_id")# =====================
# STRUKTUR DATA
# =====================
df <- df %>%
mutate(across(c(id, center_id, meal_id,city_code, region_code), as.character))
str(df)tibble [423,727 × 15] (S3: tbl_df/tbl/data.frame)
$ id : chr [1:423727] "1379560" "1466964" "1346989" "1338232" ...
$ week : num [1:423727] 1 1 1 1 1 1 1 1 1 1 ...
$ center_id : chr [1:423727] "55" "55" "55" "55" ...
$ meal_id : chr [1:423727] "1885" "1993" "2539" "2139" ...
$ checkout_price : num [1:423727] 137 137 135 340 244 ...
$ base_price : num [1:423727] 152 136 136 438 242 ...
$ emailer_for_promotion: num [1:423727] 0 0 0 0 0 0 0 0 0 0 ...
$ homepage_featured : num [1:423727] 0 0 0 0 0 0 0 0 0 1 ...
$ num_orders : num [1:423727] 177 270 189 54 40 28 190 391 472 676 ...
$ city_code : chr [1:423727] "647" "647" "647" "647" ...
$ region_code : chr [1:423727] "56" "56" "56" "56" ...
$ type : chr [1:423727] "TYPE_C" "TYPE_C" "TYPE_C" "TYPE_C" ...
$ op_area : num [1:423727] 2 2 2 2 2 2 2 2 2 2 ...
$ category : chr [1:423727] "Beverages" "Beverages" "Beverages" "Beverages" ...
$ cuisine : chr [1:423727] "Thai" "Thai" "Thai" "Indian" ...
summary(df$week) Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 37.00 70.00 69.68 103.00 135.00
#Cek Missing Value
sum(is.na(df))[1] 0
Exploratory Data Analysis (EDA)
# ==========================
# Exploratory Data Analysis
# =========================
df %>%
group_by(week) %>%
summarise(mean_orders = mean(num_orders)) %>%
ggplot(aes(x = week, y = mean_orders)) +
geom_line(color = "blue") +
theme_minimal()#Eksplorasi Distribusi variabel target --> num_orders
#Plot num_orders
ggplot(df, aes(x = num_orders)) +
geom_histogram(bins = 50, fill = "grey30", color = "white") +
geom_density(aes(y = after_stat(count) * (max(df$num_orders, na.rm = TRUE)/50)),
color = "red", linewidth = 1) +
theme_minimal()Data mingguan numorders menunjukkan bahwa jumlah pesanan berfluktuasi dari minggu ke minggu, maka dari itu perlu memasukkan unsur waktu kedalam model. Sedangkan dilihat dari distribusinya, jumlah pesanan (num_orders) menunjukkan kemiringan yang ekstrem (ada pesanan yang sangat sedikit tapi juga ada yang ribuan porsi), ini menjadi alasan kuat untuk menggunakan log1p(num_orders). Transformasi logaritma membantu “merapikan” distribusi ini agar model machine learning tidak bingung oleh angka-angka ekstrem.
promo_impact <- df %>%
group_by(emailer_for_promotion, homepage_featured) %>%
summarise(mean_num_orders = mean(num_orders, na.rm = TRUE), .groups = "drop") %>%
mutate(
emailer_for_promotion = ifelse(emailer_for_promotion == 1, "Email", "No Email"),
homepage_featured = ifelse(homepage_featured == 1, "Featured", "No Featured")
)
ggplot(promo_impact,
aes(emailer_for_promotion, mean_num_orders, fill = homepage_featured)) +
geom_col(position = "dodge", width = 0.6) +
geom_text(aes(label = comma(mean_num_orders)),
position = position_dodge(0.6),
vjust = -0.4, size = 3.5) +
scale_fill_manual(values = c("No Featured" = "#6C8EBF",
"Featured" = "#F4A261")) +
scale_y_continuous(labels = comma, expand = expansion(mult = c(0, .1))) +
labs(title = "Promotional Impact on Orders",
x = "Email Promotion",
y = "Mean - Number of Orders",
fill = "Homepage") +
theme_minimal() +
theme(plot.title = element_text(hjust = .5, face = "bold"),
legend.position = "top",
panel.grid.major.x = element_blank())- Email Promosi :
a. Saat tidak ada email promosi, jumlah pesanan cenderung lebih rendah, baik produk ditambilkan di Homepage maupun tidak
b. Ketika email promosi dikirim, jumlah pesanan meningkat secara signifikan. ini menunjukkan bahwa email berperan sebagai pengingat atau pendoroang keputusan pembelian pelanggan
- Penempatan di Homepage :
a. Produk yang ditampilkan di homepage secara konsisten menghasilkan jumlah pesanan lebih tinggi dibanding produk yang tidak ditampilkan
b. Bahkan tanpa email promosi, penempatan di homepage tetap meningkatkan visibilitas dan minat pelanggan
#Eksplorasi nilai lift
df2 <- df %>%
mutate(
category_cuisine = paste(category, cuisine, sep = " - "),
price_diff = base_price - checkout_price,
discount_pct = price_diff/(base_price+1),
discount_group = ifelse(discount_pct > median(discount_pct),
"High Discount", "Low Discount")
)
#menampilkan tabel category-cuisine dengan total_orders, mean orders, dan lift
df_lift <- df2 %>%
group_by(category_cuisine, discount_group) %>%
summarise(
total_orders_group = sum(num_orders, na.rm = TRUE),
avg_orders_group = mean(num_orders, na.rm = TRUE),
.groups = "drop"
) %>%
pivot_wider(
names_from = discount_group,
values_from = c(total_orders_group, avg_orders_group),
names_glue = "{.value}_{discount_group}"
) %>%
mutate(
lift = `avg_orders_group_High Discount` / `avg_orders_group_Low Discount`,
total_orders = `total_orders_group_High Discount` + `total_orders_group_Low Discount`,
avg_orders = (`avg_orders_group_High Discount` + `avg_orders_group_Low Discount`) / 2
) %>%
select(category_cuisine, total_orders, avg_orders, lift) %>% # hanya kolom yang dibutuhkan
arrange(desc(lift))
print(df_lift)# A tibble: 17 × 4
category_cuisine total_orders avg_orders lift
<chr> <dbl> <dbl> <dbl>
1 Seafood - Continental 2605369 105. 2.83
2 Pizza - Continental 6759257 211. 2.41
3 Sandwich - Italian 16756602 544. 2.11
4 Rice Bowl - Indian 19654461 629. 1.64
5 Beverages - Indian 2169758 78.3 1.61
6 Salad - Italian 10230201 404. 1.52
7 Beverages - Continental 5672472 187. 1.45
8 Desert - Indian 1778903 65.0 1.44
9 Other Snacks - Thai 4445788 160. 1.41
10 Pasta - Italian 1521890 57.7 1.37
11 Fish - Continental 733942 85.7 1.32
12 Beverages - Italian 12964264 418. 1.27
13 Starters - Thai 4445465 163. 1.24
14 Biryani - Indian 580955 30.5 1.07
15 Beverages - Thai 17120478 565. 1.03
16 Soup - Thai 937261 79.8 1.02
17 Extras - Thai 3720139 309. 0.804
Lift merupakan metrik yang digunakan untuk mengukur seberapa besar peningkatan suatu pesanan makanan akibat adanya faktor tertentu seperti diskon atau promosi dibandingkan dengan kondisi normal tanpa faktor tersebut.
Lift = 1 → tidak sensitif
Lift = 1.5 → naik 50% saat diskon tinggi
Lift > 2 → sangat sensitif
Kategori dengan lift mendekati 1 = tidak perlu diskon besar.
Biasanya kategori yang tidak perlu diskon besar:
Premium meal
Dessert tertentu
Beverage brand kuat
Comfort food populer
Karena:
Sudah punya loyal customer
Dibeli bukan karena harga
Repeat order tinggi
ggplot(df_lift, aes(x = reorder(category_cuisine, total_orders), y = total_orders, fill = lift)) +
geom_col() +
geom_text(aes(label = round(lift, 2), # tampilkan lift dengan 2 desimal
y = total_orders + max(total_orders)*0.02), # geser sedikit ke kanan
hjust = 0, # teks di kanan bar
size = 3) +
coord_flip() + # horizontal bar
labs(
title = "Total Orders per Category + Cuisine",
x = "Category - Cuisine",
y = "Total Orders",
fill = "Lift (High vs Low Discount)"
) +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
theme_minimal()Feature Engineering
# =====================
# FEATURE ENGINEERING
# =====================
# FOR CATEGORY LIFT
lift_table <- base_df %>%
mutate(
price_diff = base_price - checkout_price,
discount_pct = price_diff/(base_price+1),
discount_group = ifelse(
discount_pct > median(discount_pct, na.rm=TRUE),
"high","low"
)
) %>%
group_by(category, discount_group) %>%
summarise(
avg_orders = mean(num_orders, na.rm=TRUE),
.groups="drop"
) %>%
pivot_wider(
names_from = discount_group,
values_from = avg_orders,
values_fill = 0
) %>%
mutate(
high = ifelse(is.na(high),0,high),
low = ifelse(is.na(low) | low==0,1,low),
category_lift = high/low,
category_lift = ifelse(
is.na(category_lift) | is.infinite(category_lift),
1,
category_lift
)
) %>%
select(category, category_lift)
# SAFE EWMA
ewma_safe <- function(x, alpha){
x <- as.numeric(x)
n <- length(x)
if(n <= 1) return(x)
out <- numeric(n)
out[1] <- x[1]
for(i in 2:n){
out[i] <- alpha*x[i] + (1-alpha)*out[i-1]
}
return(out)
}
create_features <- function(df){
df <- df %>%
left_join(lift_table, by="category")
if(!"category_lift" %in% colnames(df)){
df$category_lift <- 1
}
df <- df %>%
mutate(
category_lift = ifelse(is.na(category_lift),1,category_lift)
) %>%
arrange(center_id, meal_id, week) %>%
mutate(
week_of_year = ((week - 1) %% 52) + 1,
week_sin = sin(2*pi*week_of_year/52),
week_cos = cos(2*pi*week_of_year/52),
price_diff = base_price - checkout_price,
discount_pct = price_diff/(base_price+1),
is_promo = emailer_for_promotion + homepage_featured,
promo_strength = is_promo*discount_pct,
promo_x_price = promo_strength * base_price,
promo_category_interaction = is_promo * category_lift
) %>%
group_by(center_id, meal_id) %>%
mutate(
lag_1 = lag(num_orders,1),
lag_2 = lag(num_orders,2),
lag_7 = lag(num_orders,7),
ewma_7 = lag(ewma_safe(num_orders,0.3)),
ewma_14 = lag(ewma_safe(num_orders,0.15)),
rolling_mean_7 = rollapplyr(lag_1,7,mean,fill=NA),
rolling_std_7 = rollapplyr(lag_1,7,sd,fill=NA),
momentum = lag_1 - lag_7,
center_meal_avg = lag(cummean(num_orders))
) %>%
ungroup()
return(df)
}Pembagian Data Training dan Testing
Data Training dan Testing dibagi tidak dengan cara random, melainkan berdasarkan pembagian urutan waktu (week)
# ========================
# SPLIT TRAINING TESTING
# =======================
df <- create_features(base_df)
df <- df %>% filter(!is.na(rolling_mean_7))
df$num_orders_log <- log1p(df$num_orders)
cat_cols <- c("center_id","meal_id","category","cuisine",
"type","city_code","region_code")
df[cat_cols] <- lapply(df[cat_cols], as.factor)
train_df <- df %>% filter(week <= 108)
test_df <- df %>% filter(week > 108)
drop_cols <- c("id","num_orders","num_orders_log")
features <- setdiff(names(train_df), drop_cols)
train_matrix <- data.matrix(train_df[,features])
test_matrix <- data.matrix(test_df[,features])
lgb_train <- lgb.Dataset(train_matrix,
label=train_df$num_orders_log)
lgb_val <- lgb.Dataset(test_matrix,
label=test_df$num_orders_log)
dtrain <- xgb.DMatrix(train_matrix, label=train_df$num_orders_log)
dtest <- xgb.DMatrix(test_matrix, label=test_df$num_orders_log)# ============
# CUSTOM MAPE
# ============
mape_eval <- function(preds, dtrain){
labels <- expm1(get_field(dtrain,"label"))
preds_real <- expm1(preds)
mape <- mean(abs((labels - preds_real)/pmax(labels,1)))
list(name="MAPE", value=mape, higher_better=FALSE)
}ML Algoritm : LightGBM, XGBoost, dan Random Forest
# =====================
# MODEL CANDIDATE
# =====================
# LIGHTGBM PARAMETERS
params1 <- list(
objective="regression",
metric="None",
learning_rate=0.03,
num_leaves=130,
min_data_in_leaf=120,
feature_fraction=0.85,
bagging_fraction=0.8,
bagging_freq=5,
lambda_l1=1,
lambda_l2=2,
verbosity=-1
)
waktu_eksekusi1 <- system.time({
model.lightGBM <- lgb.train(
params=params1,
data=lgb_train,
nrounds=5000,
valids=list(test=lgb_val),
eval=mape_eval,
early_stopping_rounds=600
)
})
# XGBOOST PARAMETERS
params2 <- list(
objective = "reg:squarederror",
eta = 0.03,
max_depth = 7,
min_child_weight = 130,
gamma = 2,
subsample = 0.8,
colsample_bytree = 0.85,
lambda = 1,
alpha = 2
)
waktu_eksekusi2 <- system.time({
model.XGBoost <- xgb.train(
params = params2,
data = dtrain,
nrounds = 5000,
evals = list(test = dtest),
early_stopping_rounds = 600,
maximize = FALSE,
verbose = 0
)
})
# RF PARAMETERS
waktu_eksekusi3 <- system.time({
model_rf <- ranger(
formula = num_orders_log ~ .,
data = train_df[, c(features, "num_orders_log")],
num.trees = 300, # Jumlah pohon
mtry = floor(length(features)/3), # Parameter tuning standar RF
min.node.size = 5,
importance = 'impurity', # Agar kita bisa cek Feature Importance
verbose = TRUE,
seed = 123
)
})Growing trees.. Progress: 16%. Estimated remaining time: 2 minutes, 52 seconds.
Growing trees.. Progress: 31%. Estimated remaining time: 2 minutes, 20 seconds.
Growing trees.. Progress: 46%. Estimated remaining time: 1 minute, 53 seconds.
Growing trees.. Progress: 60%. Estimated remaining time: 1 minute, 23 seconds.
Growing trees.. Progress: 75%. Estimated remaining time: 52 seconds.
Growing trees.. Progress: 90%. Estimated remaining time: 22 seconds.
Evaluation Model Prediction
# ==========================================
# VALIDATION LIGHTGBM
# ==========================================
pred_log1 <- predict(model.lightGBM,test_matrix)
pred1 <- pmax(expm1(pred_log1),0)
MAPE <- mean(abs((test_df$num_orders - pred1)/
pmax(test_df$num_orders,1)))
R2 <- 1 - sum((test_df$num_orders - pred1)^2) /
sum((test_df$num_orders - mean(test_df$num_orders))^2)
rmse <- sqrt(mean((pred1 - test_df$num_orders)^2))
# ==========================================
# VALIDATION XGBOOST
# ==========================================
pred_log2 <- predict(model.XGBoost,dtest)
pred2 <- pmax(expm1(pred_log2),0)
MAPE_2 <- mean(abs((test_df$num_orders - pred2)/
pmax(test_df$num_orders,1)))
R2_2 <- 1 - sum((test_df$num_orders - pred2)^2) /
sum((test_df$num_orders - mean(test_df$num_orders))^2)
rmse_2 <- sqrt(mean((pred2 - test_df$num_orders)^2))
# ==========================================
# VALIDATION RANDOM FOREST
# ==========================================
pred_rf <- predict(model_rf, data = test_df[, features])
pred_log3 <- pred_rf$predictions
pred3 <- pmax(expm1(pred_log3), 0)
MAPE_3 <- mean(abs((test_df$num_orders - pred3) / pmax(test_df$num_orders, 1)))
R2_3 <- 1 - sum((test_df$num_orders - pred3)^2) / sum((test_df$num_orders - mean(test_df$num_orders))^2)
rmse_3 <- sqrt(mean((pred3 - test_df$num_orders)^2))
# evaluasi model
summary_table <- data.frame(
Metric = c("R-Squared", "MAPE", "RMSE", "Time (sec)"),
LightGBM = sprintf("%.2f", c(R2, MAPE, rmse, waktu_eksekusi1[3])),
XGBoost = sprintf("%.2f", c(R2_2, MAPE_2, rmse_2, waktu_eksekusi2[3])),
RandomForest = sprintf("%.2f", c(R2_3, MAPE_3, rmse_3, waktu_eksekusi3[3]))
)
print(summary_table, row.names = FALSE) Metric LightGBM XGBoost RandomForest
R-Squared 0.80 0.80 0.80
MAPE 0.43 0.44 0.44
RMSE 173.68 174.95 174.66
Time (sec) 40.78 11.30 223.29
Recursive Prediction : Data Testing
# ==========================================
# RECURSIVE PREDICT
# ==========================================
future_df <- future_raw
future_df$num_orders <- NA
all_data <- bind_rows(history_data,future_df)
all_data[cat_cols] <- lapply(all_data[cat_cols], as.factor)
future_weeks <- sort(unique(future_df$week))
for(w in future_weeks){
all_data <- create_features(all_data)
pred_data <- all_data %>% filter(week==w)
pred_matrix <- data.matrix(pred_data[,features])
pred_log <- predict(model.lightGBM,pred_matrix)
pred_value <- pmax(expm1(pred_log),0)
all_data$num_orders[all_data$week==w &
is.na(all_data$num_orders)] <- pred_value
}# ==========================================
# SUBMISSION
# ==========================================
final_prediction <- all_data %>%
filter(week %in% future_weeks)
submission <- read_csv("sample_submissions.csv")Rows: 32821 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (2): id, num_orders
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
submission_final <- submission %>%
select(id) %>%
left_join(final_prediction,by="id") %>%
select(id,num_orders)
write_csv(submission_final,"submission.csv")Feature Importance
Feature Importance dimanfaatkan untuk memberikan skor pada setiap fitur (variabel) berdasarkan seberapa besar kontribusinya dalam membantu model membuat prediksi yang akurat. Model terbaik yang dipilih adalah LightGBM, sehingga feature Importance berdasarkan model LightGBM.
# ==========================================
# FEATURE IMPORTANCE
# ==========================================
importance1 <- lgb.importance(model.lightGBM, percentage = TRUE)
importance_plot <- importance1 %>%
arrange(desc(Gain)) %>%
slice_head(n = 20) %>%
mutate(Feature = reorder(Feature, Gain))
# Plot Feature Importance
ggplot(importance_plot, aes(x = Feature, y = Gain)) +
geom_col(fill = "#6C5CE7", width = 0.7) +
coord_flip() +
theme_minimal(base_size = 13) +
labs(
title = "Top 20 Feature Importance - LightGBM",
subtitle = "Based on Gain Contribution",
x = "",
y = "Gain (%)"
) +
theme(
plot.title = element_text(face = "bold", size = 16),
plot.subtitle = element_text(color = "gray40"),
axis.text.y = element_text(face = "bold"),
panel.grid.major.y = element_blank()
)