Laporan Tugas Kaggle Food Demand Forecasting

Author

Rosita Ria Rusesta - M0501251016

Latar Belakang

Perusahaan pengiriman makanan yang beroperasi di beberapa kota telah mendirikan berbagai pusat pemenuhan (fulfilment center ) untuk mengatur pengiriman pesanan makanan kepada pelanggan. Mengingat bahan baku yang digunakan mayoritas adalah bahan yang mudah busuk, pengadaan dan perencanaan stok mingguan menjadi sangat kritis. Selain itu, perencanaan tenaga kerja di pusat-pusat ini juga membutuhkan ramalan permintaan yang akurat untuk efisiensi operasional.

Pengisian ulang untuk sebagian besar bahan baku dilakukan setiap minggu. Tujuan dari proyek ini adalah untuk mengembangkan model yang dapat memprediksi permintaan untuk kombinasi pusat- makanan untuk 10 minggu ke depan (Minggu: 136-145-Test Dataset). Ramalan yang akurat membuat pusat-pusat pemenuhan dapat merencanakan pengadaan bahan baku dan penjadwalan staf dengan lebih efektif, sehingga mengurangi pemborosan dan meningkatkan kepuasan pelanggan.

# =====================
# LIBRARIES
# =====================
library(dplyr)
library(readr)
library(zoo)
library(lightgbm)
library(xgboost)
library(ranger)
library(data.table)
library(ggplot2)
library(plotly)
library(scales)
library(corrplot)
library(tidyverse)
library(ggpubr)
library(tidyr)
library(tictoc)

# =====================
# LOAD DATA
# =====================
All_data <- read_csv("All_data.csv", show_col_types = FALSE)
new_data <- read_csv("data_baru.csv", show_col_types = FALSE)
center_info <- read_csv("fulfilment_center_info.csv", show_col_types = FALSE)
meal_info <- read_csv("meal_info.csv", show_col_types = FALSE)

# =====================
# JOIN DATA
# =====================
df <- All_data %>%
  left_join(center_info, by = c("center_id" = "id")) %>%
  left_join(meal_info, by = "meal_id")
base_df <- df
history_data <- base_df

future_raw <- new_data %>%
  left_join(center_info, by = c("center_id"="id")) %>%
  left_join(meal_info, by="meal_id")

# =====================
# STRUKTUR DATA
# =====================
df <- df %>%
  mutate(across(c(id, center_id, meal_id,city_code, region_code), as.character))

str(df)

tibble [423,727 × 15] (S3: tbl_df/tbl/data.frame)
 $ id                   : chr [1:423727] "1379560" "1466964" "1346989" "1338232" ...
 $ week                 : num [1:423727] 1 1 1 1 1 1 1 1 1 1 ...
 $ center_id            : chr [1:423727] "55" "55" "55" "55" ...
 $ meal_id              : chr [1:423727] "1885" "1993" "2539" "2139" ...
 $ checkout_price       : num [1:423727] 137 137 135 340 244 ...
 $ base_price           : num [1:423727] 152 136 136 438 242 ...
 $ emailer_for_promotion: num [1:423727] 0 0 0 0 0 0 0 0 0 0 ...
 $ homepage_featured    : num [1:423727] 0 0 0 0 0 0 0 0 0 1 ...
 $ num_orders           : num [1:423727] 177 270 189 54 40 28 190 391 472 676 ...
 $ city_code            : chr [1:423727] "647" "647" "647" "647" ...
 $ region_code          : chr [1:423727] "56" "56" "56" "56" ...
 $ type                 : chr [1:423727] "TYPE_C" "TYPE_C" "TYPE_C" "TYPE_C" ...
 $ op_area              : num [1:423727] 2 2 2 2 2 2 2 2 2 2 ...
 $ category             : chr [1:423727] "Beverages" "Beverages" "Beverages" "Beverages" ...
 $ cuisine              : chr [1:423727] "Thai" "Thai" "Thai" "Indian" ...

summary(df$week)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.00   37.00   70.00   69.68  103.00  135.00

#Cek Missing Value
sum(is.na(df))

[1] 0

Exploratory Data Analysis (EDA)

# ==========================
# Exploratory Data Analysis
# =========================
df %>%
  group_by(week) %>%
  summarise(mean_orders = mean(num_orders)) %>%
  ggplot(aes(x = week, y = mean_orders)) +
  geom_line(color = "blue") +
  theme_minimal()

#Eksplorasi Distribusi variabel target --> num_orders
#Plot num_orders
ggplot(df, aes(x = num_orders)) +
  geom_histogram(bins = 50, fill = "grey30", color = "white") +
  geom_density(aes(y = after_stat(count) * (max(df$num_orders, na.rm = TRUE)/50)), 
               color = "red", linewidth = 1) +
  theme_minimal()

Data mingguan numorders menunjukkan bahwa jumlah pesanan berfluktuasi dari minggu ke minggu, maka dari itu perlu memasukkan unsur waktu kedalam model. Sedangkan dilihat dari distribusinya, jumlah pesanan (num_orders) menunjukkan kemiringan yang ekstrem (ada pesanan yang sangat sedikit tapi juga ada yang ribuan porsi), ini menjadi alasan kuat untuk menggunakan log1p(num_orders). Transformasi logaritma membantu “merapikan” distribusi ini agar model machine learning tidak bingung oleh angka-angka ekstrem.

promo_impact <- df %>%
  group_by(emailer_for_promotion, homepage_featured) %>%
  summarise(mean_num_orders = mean(num_orders, na.rm = TRUE), .groups = "drop") %>%
  mutate(
    emailer_for_promotion = ifelse(emailer_for_promotion == 1, "Email", "No Email"),
    homepage_featured = ifelse(homepage_featured == 1, "Featured", "No Featured")
  )

ggplot(promo_impact,
       aes(emailer_for_promotion, mean_num_orders, fill = homepage_featured)) +
  geom_col(position = "dodge", width = 0.6) +
  geom_text(aes(label = comma(mean_num_orders)),
            position = position_dodge(0.6),
            vjust = -0.4, size = 3.5) +
  scale_fill_manual(values = c("No Featured" = "#6C8EBF",
                               "Featured" = "#F4A261")) +
  scale_y_continuous(labels = comma, expand = expansion(mult = c(0, .1))) +
  labs(title = "Promotional Impact on Orders",
       x = "Email Promotion",
       y = "Mean - Number of Orders",
       fill = "Homepage") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = .5, face = "bold"),
        legend.position = "top",
        panel.grid.major.x = element_blank())

Email Promosi :

a. Saat tidak ada email promosi, jumlah pesanan cenderung lebih rendah, baik produk ditambilkan di Homepage maupun tidak

b. Ketika email promosi dikirim, jumlah pesanan meningkat secara signifikan. ini menunjukkan bahwa email berperan sebagai pengingat atau pendoroang keputusan pembelian pelanggan

Penempatan di Homepage :

a. Produk yang ditampilkan di homepage secara konsisten menghasilkan jumlah pesanan lebih tinggi dibanding produk yang tidak ditampilkan

b. Bahkan tanpa email promosi, penempatan di homepage tetap meningkatkan visibilitas dan minat pelanggan

#Eksplorasi nilai lift
df2 <- df %>%
  mutate(
    category_cuisine = paste(category, cuisine, sep = " - "),
    price_diff = base_price - checkout_price,
    discount_pct = price_diff/(base_price+1),
    discount_group = ifelse(discount_pct > median(discount_pct), 
                            "High Discount", "Low Discount")
  )
#menampilkan tabel category-cuisine dengan total_orders, mean orders, dan lift
df_lift <- df2 %>% 
  group_by(category_cuisine, discount_group) %>%
  summarise(
    total_orders_group = sum(num_orders, na.rm = TRUE),
    avg_orders_group = mean(num_orders, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_wider(
    names_from = discount_group,
    values_from = c(total_orders_group, avg_orders_group),
    names_glue = "{.value}_{discount_group}"
  ) %>%
  mutate(
    lift = `avg_orders_group_High Discount` / `avg_orders_group_Low Discount`,
    total_orders = `total_orders_group_High Discount` + `total_orders_group_Low Discount`,
    avg_orders = (`avg_orders_group_High Discount` + `avg_orders_group_Low Discount`) / 2
  ) %>%
  select(category_cuisine, total_orders, avg_orders, lift) %>%  # hanya kolom yang dibutuhkan
  arrange(desc(lift)) 

print(df_lift)

# A tibble: 17 × 4
   category_cuisine        total_orders avg_orders  lift
   <chr>                          <dbl>      <dbl> <dbl>
 1 Seafood - Continental        2605369      105.  2.83 
 2 Pizza - Continental          6759257      211.  2.41 
 3 Sandwich - Italian          16756602      544.  2.11 
 4 Rice Bowl - Indian          19654461      629.  1.64 
 5 Beverages - Indian           2169758       78.3 1.61 
 6 Salad - Italian             10230201      404.  1.52 
 7 Beverages - Continental      5672472      187.  1.45 
 8 Desert - Indian              1778903       65.0 1.44 
 9 Other Snacks - Thai          4445788      160.  1.41 
10 Pasta - Italian              1521890       57.7 1.37 
11 Fish - Continental            733942       85.7 1.32 
12 Beverages - Italian         12964264      418.  1.27 
13 Starters - Thai              4445465      163.  1.24 
14 Biryani - Indian              580955       30.5 1.07 
15 Beverages - Thai            17120478      565.  1.03 
16 Soup - Thai                   937261       79.8 1.02 
17 Extras - Thai                3720139      309.  0.804

Lift merupakan metrik yang digunakan untuk mengukur seberapa besar peningkatan suatu pesanan makanan akibat adanya faktor tertentu seperti diskon atau promosi dibandingkan dengan kondisi normal tanpa faktor tersebut.

Lift = 1 → tidak sensitif
Lift = 1.5 → naik 50% saat diskon tinggi
Lift > 2 → sangat sensitif
Kategori dengan lift mendekati 1 = tidak perlu diskon besar.

Biasanya kategori yang tidak perlu diskon besar:

Premium meal
Dessert tertentu
Beverage brand kuat
Comfort food populer

Karena:

Sudah punya loyal customer
Dibeli bukan karena harga
Repeat order tinggi

ggplot(df_lift, aes(x = reorder(category_cuisine, total_orders), y = total_orders, fill = lift)) +
  geom_col() +
  geom_text(aes(label = round(lift, 2),  # tampilkan lift dengan 2 desimal
                y = total_orders + max(total_orders)*0.02),  # geser sedikit ke kanan
            hjust = 0,  # teks di kanan bar
            size = 3) +
  coord_flip() +  # horizontal bar
  labs(
    title = "Total Orders per Category + Cuisine",
    x = "Category - Cuisine",
    y = "Total Orders",
    fill = "Lift (High vs Low Discount)"
  ) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  theme_minimal()

Feature Engineering

# =====================
# FEATURE ENGINEERING 
# =====================

# FOR CATEGORY LIFT

lift_table <- base_df %>%
  mutate(
    price_diff = base_price - checkout_price,
    discount_pct = price_diff/(base_price+1),
    discount_group = ifelse(
      discount_pct > median(discount_pct, na.rm=TRUE),
      "high","low"
    )
  ) %>%
  group_by(category, discount_group) %>%
  summarise(
    avg_orders = mean(num_orders, na.rm=TRUE),
    .groups="drop"
  ) %>%
  pivot_wider(
    names_from = discount_group,
    values_from = avg_orders,
    values_fill = 0
  ) %>%
  mutate(
    high = ifelse(is.na(high),0,high),
    low  = ifelse(is.na(low) | low==0,1,low),
    category_lift = high/low,
    category_lift = ifelse(
      is.na(category_lift) | is.infinite(category_lift),
      1,
      category_lift
    )
  ) %>%
  select(category, category_lift)

# SAFE EWMA

ewma_safe <- function(x, alpha){
  x <- as.numeric(x)
  n <- length(x)
  if(n <= 1) return(x)
  out <- numeric(n)
  out[1] <- x[1]
  for(i in 2:n){
    out[i] <- alpha*x[i] + (1-alpha)*out[i-1]
  }
  return(out)
}


create_features <- function(df){
  
  df <- df %>%
    left_join(lift_table, by="category")
  
  if(!"category_lift" %in% colnames(df)){
    df$category_lift <- 1
  }
  
  df <- df %>%
    mutate(
      category_lift = ifelse(is.na(category_lift),1,category_lift)
    ) %>%
    arrange(center_id, meal_id, week) %>%
    mutate(
      week_of_year = ((week - 1) %% 52) + 1,
      week_sin = sin(2*pi*week_of_year/52),
      week_cos = cos(2*pi*week_of_year/52),
      price_diff = base_price - checkout_price,
      discount_pct = price_diff/(base_price+1),
      is_promo = emailer_for_promotion + homepage_featured,
      promo_strength = is_promo*discount_pct,
      promo_x_price = promo_strength * base_price,
      promo_category_interaction = is_promo * category_lift
    ) %>%
    group_by(center_id, meal_id) %>%
    mutate(
      lag_1  = lag(num_orders,1),
      lag_2  = lag(num_orders,2),
      lag_7  = lag(num_orders,7),
      ewma_7  = lag(ewma_safe(num_orders,0.3)),
      ewma_14 = lag(ewma_safe(num_orders,0.15)),
      rolling_mean_7  = rollapplyr(lag_1,7,mean,fill=NA),
      rolling_std_7   = rollapplyr(lag_1,7,sd,fill=NA),
      momentum = lag_1 - lag_7,
      center_meal_avg = lag(cummean(num_orders))
    ) %>%
    ungroup()
  
  return(df)
}

Pembagian Data Training dan Testing

Data Training dan Testing dibagi tidak dengan cara random, melainkan berdasarkan pembagian urutan waktu (week)

# ========================
# SPLIT TRAINING TESTING
# =======================

df <- create_features(base_df)
df <- df %>% filter(!is.na(rolling_mean_7))

df$num_orders_log <- log1p(df$num_orders)

cat_cols <- c("center_id","meal_id","category","cuisine",
              "type","city_code","region_code")

df[cat_cols] <- lapply(df[cat_cols], as.factor)

train_df <- df %>% filter(week <= 108)
test_df  <- df %>% filter(week > 108)

drop_cols <- c("id","num_orders","num_orders_log")
features <- setdiff(names(train_df), drop_cols)

train_matrix <- data.matrix(train_df[,features])
test_matrix  <- data.matrix(test_df[,features])

lgb_train <- lgb.Dataset(train_matrix,
                         label=train_df$num_orders_log)

lgb_val <- lgb.Dataset(test_matrix,
                       label=test_df$num_orders_log)

dtrain <- xgb.DMatrix(train_matrix, label=train_df$num_orders_log)
dtest  <- xgb.DMatrix(test_matrix, label=test_df$num_orders_log)

# ============
# CUSTOM MAPE
# ============

mape_eval <- function(preds, dtrain){
  labels <- expm1(get_field(dtrain,"label"))
  preds_real <- expm1(preds)
  mape <- mean(abs((labels - preds_real)/pmax(labels,1)))
  list(name="MAPE", value=mape, higher_better=FALSE)
}

ML Algoritm : LightGBM, XGBoost, dan Random Forest

# =====================
# MODEL CANDIDATE
# =====================
# LIGHTGBM PARAMETERS
params1 <- list(
  objective="regression",
  metric="None",
  learning_rate=0.03,
  num_leaves=130,
  min_data_in_leaf=120,
  feature_fraction=0.85,
  bagging_fraction=0.8,
  bagging_freq=5,
  lambda_l1=1,
  lambda_l2=2,
  verbosity=-1
)
waktu_eksekusi1 <- system.time({
model.lightGBM <- lgb.train(
  params=params1,
  data=lgb_train,
  nrounds=5000,
  valids=list(test=lgb_val),
  eval=mape_eval,
  early_stopping_rounds=600
)
})

# XGBOOST PARAMETERS

params2 <- list(
  objective = "reg:squarederror",
  eta = 0.03,
  max_depth = 7,
  min_child_weight = 130,
  gamma = 2,
  subsample = 0.8,
  colsample_bytree = 0.85,
  lambda = 1,
  alpha = 2
)

waktu_eksekusi2 <- system.time({
model.XGBoost <- xgb.train(
  params = params2,
  data = dtrain,
  nrounds = 5000,
  evals = list(test = dtest),
  early_stopping_rounds = 600,
  maximize = FALSE,
  verbose = 0
)
})
# RF PARAMETERS
waktu_eksekusi3 <- system.time({
model_rf <- ranger(
  formula         = num_orders_log ~ ., 
  data            = train_df[, c(features, "num_orders_log")],
  num.trees       = 300,        # Jumlah pohon
  mtry            = floor(length(features)/3), # Parameter tuning standar RF
  min.node.size   = 5,
  importance      = 'impurity', # Agar kita bisa cek Feature Importance
  verbose         = TRUE,
  seed            = 123
)
})

Growing trees.. Progress: 16%. Estimated remaining time: 2 minutes, 52 seconds.
Growing trees.. Progress: 31%. Estimated remaining time: 2 minutes, 20 seconds.
Growing trees.. Progress: 46%. Estimated remaining time: 1 minute, 53 seconds.
Growing trees.. Progress: 60%. Estimated remaining time: 1 minute, 23 seconds.
Growing trees.. Progress: 75%. Estimated remaining time: 52 seconds.
Growing trees.. Progress: 90%. Estimated remaining time: 22 seconds.

Evaluation Model Prediction

# ==========================================
# VALIDATION LIGHTGBM
# ==========================================

pred_log1 <- predict(model.lightGBM,test_matrix)
pred1 <- pmax(expm1(pred_log1),0)

MAPE <- mean(abs((test_df$num_orders - pred1)/
                   pmax(test_df$num_orders,1)))
R2 <- 1 - sum((test_df$num_orders - pred1)^2) /
  sum((test_df$num_orders - mean(test_df$num_orders))^2)
rmse <- sqrt(mean((pred1 - test_df$num_orders)^2))

# ==========================================
# VALIDATION XGBOOST
# ==========================================

pred_log2 <- predict(model.XGBoost,dtest)
pred2 <- pmax(expm1(pred_log2),0)

MAPE_2 <- mean(abs((test_df$num_orders - pred2)/
                   pmax(test_df$num_orders,1)))
R2_2 <- 1 - sum((test_df$num_orders - pred2)^2) /
  sum((test_df$num_orders - mean(test_df$num_orders))^2)
rmse_2 <- sqrt(mean((pred2 - test_df$num_orders)^2))

# ==========================================
# VALIDATION RANDOM FOREST
# ==========================================
pred_rf <- predict(model_rf, data = test_df[, features])
pred_log3 <- pred_rf$predictions
pred3 <- pmax(expm1(pred_log3), 0)

MAPE_3 <- mean(abs((test_df$num_orders - pred3) / pmax(test_df$num_orders, 1)))
R2_3 <- 1 - sum((test_df$num_orders - pred3)^2) / sum((test_df$num_orders - mean(test_df$num_orders))^2)
rmse_3 <- sqrt(mean((pred3 - test_df$num_orders)^2))



# evaluasi model
summary_table <- data.frame(
  Metric = c("R-Squared", "MAPE", "RMSE", "Time (sec)"),
  LightGBM = sprintf("%.2f", c(R2, MAPE, rmse, waktu_eksekusi1[3])),
  XGBoost  = sprintf("%.2f", c(R2_2, MAPE_2, rmse_2, waktu_eksekusi2[3])),
  RandomForest  = sprintf("%.2f", c(R2_3, MAPE_3, rmse_3, waktu_eksekusi3[3]))
)

print(summary_table, row.names = FALSE)

     Metric LightGBM XGBoost RandomForest
  R-Squared     0.80    0.80         0.80
       MAPE     0.43    0.44         0.44
       RMSE   173.68  174.95       174.66
 Time (sec)    40.78   11.30       223.29

Recursive Prediction : Data Testing

# ==========================================
# RECURSIVE PREDICT
# ==========================================

future_df <- future_raw
future_df$num_orders <- NA

all_data <- bind_rows(history_data,future_df)
all_data[cat_cols] <- lapply(all_data[cat_cols], as.factor)

future_weeks <- sort(unique(future_df$week))

for(w in future_weeks){
  
  all_data <- create_features(all_data)
  
  pred_data <- all_data %>% filter(week==w)
  pred_matrix <- data.matrix(pred_data[,features])
  
  pred_log <- predict(model.lightGBM,pred_matrix)
  pred_value <- pmax(expm1(pred_log),0)
  
  all_data$num_orders[all_data$week==w &
                        is.na(all_data$num_orders)] <- pred_value
}

# ==========================================
# SUBMISSION
# ==========================================

final_prediction <- all_data %>%
  filter(week %in% future_weeks)

submission <- read_csv("sample_submissions.csv")

Rows: 32821 Columns: 2
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (2): id, num_orders

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

submission_final <- submission %>%
  select(id) %>%
  left_join(final_prediction,by="id") %>%
  select(id,num_orders)

write_csv(submission_final,"submission.csv")

Feature Importance

Feature Importance dimanfaatkan untuk memberikan skor pada setiap fitur (variabel) berdasarkan seberapa besar kontribusinya dalam membantu model membuat prediksi yang akurat. Model terbaik yang dipilih adalah LightGBM, sehingga feature Importance berdasarkan model LightGBM.

# ==========================================
# FEATURE IMPORTANCE
# ==========================================
importance1 <- lgb.importance(model.lightGBM, percentage = TRUE)


importance_plot <- importance1 %>%
  arrange(desc(Gain)) %>%
  slice_head(n = 20) %>%
  mutate(Feature = reorder(Feature, Gain))

# Plot Feature Importance
ggplot(importance_plot, aes(x = Feature, y = Gain)) +
  geom_col(fill = "#6C5CE7", width = 0.7) +
  coord_flip() +
  theme_minimal(base_size = 13) +
  labs(
    title = "Top 20 Feature Importance - LightGBM",
    subtitle = "Based on Gain Contribution",
    x = "",
    y = "Gain (%)"
  ) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(color = "gray40"),
    axis.text.y = element_text(face = "bold"),
    panel.grid.major.y = element_blank()
  )