1. Executive Summary & Modeling Philosophy

Annual healthcare expenditure data typically exhibit two major actuarial characteristics: zero-inflation and heavy right-skewness. To address this problem, we adopt a Pure Premium Framework using a two-stage Frequency-Severity modeling approach.

The expected healthcare cost follows the actuarial decomposition:

\[ E(Y) = P(Y > 0) \times E(Y \mid Y > 0) \]

2. Dataset Overview

library(tidyverse)
library(xgboost)
library(Matrix)
library(glmnet)
library(gridExtra)
library(caret)

train <- read_csv("train.csv")
test <- read_csv("test.csv")

target_var <- "y"

leakage_vars <- c(
  "COST_NOITRU",
  "COST_NGOAITRU",
  "FRE_NOITRU",
  "FRE_NGOAITRU"
)

3. Preprocessing & Feature Engineering

3.1 Missing Value Handling

# Household ID
train <- train %>%
  mutate(HH_ID = gsub("_.*", "", ID))

test <- test %>%
  mutate(HH_ID = gsub("_.*", "", ID))

# Age imputation
median_age <- median(
  train$AGE,
  na.rm = TRUE
)

train <- train %>%
  mutate(
    AGE = coalesce(AGE, median_age)
  )

test <- test %>%
  mutate(
    AGE = coalesce(AGE, median_age)
  )

# Income variables
income_cols <- grep(
  "LUONG|TIEN|THUONG|TRO_CAP",
  colnames(train),
  value = TRUE
)

train <- train %>%
  mutate(
    across(
      all_of(income_cols),
      ~replace_na(.x, 0)
    )
  )

test <- test %>%
  mutate(
    across(
      all_of(income_cols),
      ~replace_na(.x, 0)
    )
  )

# Categorical variables
cat_cols <- c(
  "GENDER",
  "MARRIED_STATUS",
  "MA_NGHE_NGHIEP",
  "LOAI_THE_1",
  "LOAI_THE_2",
  "CO_THE_BAO_HIEM"
)

train <- train %>%
  mutate(
    across(
      all_of(cat_cols),
      ~as.factor(
        replace_na(
          as.character(.),
          "Unknown"
        )
      )
    )
  )

test <- test %>%
  mutate(
    across(
      all_of(cat_cols),
      ~as.factor(
        replace_na(
          as.character(.),
          "Unknown"
        )
      )
    )
  )

3.2 Feature Engineering

# Household-level features

hh_structure_train <- train %>%
  group_by(HH_ID) %>%
  summarise(
    HH_SIZE = n(),
    HH_MEAN_AGE = mean(
      AGE,
      na.rm = TRUE
    ),
    HH_CHILD_COUNT = sum(
      AGE < 18,
      na.rm = TRUE
    ),
    HH_ELDERLY_COUNT = sum(
      AGE > 60,
      na.rm = TRUE
    ),
    .groups = "drop"
  ) %>%
  mutate(
    HH_ELDERLY_RATIO =
      HH_ELDERLY_COUNT / HH_SIZE
  )

train <- train %>%
  left_join(
    hh_structure_train,
    by = "HH_ID"
  )

test <- test %>%
  left_join(
    hh_structure_train,
    by = "HH_ID"
  )

# Missing handling for test
for(col in c(
  "HH_SIZE",
  "HH_MEAN_AGE",
  "HH_CHILD_COUNT",
  "HH_ELDERLY_COUNT",
  "HH_ELDERLY_RATIO"
)) {

  med_val <- median(
    train[[col]],
    na.rm = TRUE
  )

  test <- test %>%
    mutate(
      !!sym(col) :=
        coalesce(
          !!sym(col),
          med_val
        )
    )
}

# Final features
train <- train %>%
  mutate(

    TONG_THU_NHAP =
      TIEN_LUONG_12T +
      TIEN_LE_TET +
      THUONG_PHU_CAP +
      LUONG_CONG_VIEC_2,

    AGE_SQ = AGE^2,

    HAS_INSURANCE =
      ifelse(
        CO_THE_BAO_HIEM == "1",
        1,
        0
      ),

    AGE_GROUP = as.factor(
      case_when(
        AGE <= 30 ~ "Under_30",
        AGE <= 50 ~ "Age_30_to_50",
        TRUE ~ "Above_50"
      )
    ),

    is_sick =
      ifelse(y > 1e-8, 1, 0)

  )

test <- test %>%
  mutate(

    TONG_THU_NHAP =
      TIEN_LUONG_12T +
      TIEN_LE_TET +
      THUONG_PHU_CAP +
      LUONG_CONG_VIEC_2,

    AGE_SQ = AGE^2,

    HAS_INSURANCE =
      ifelse(
        CO_THE_BAO_HIEM == "1",
        1,
        0
      ),

    AGE_GROUP = as.factor(
      case_when(
        AGE <= 30 ~ "Under_30",
        AGE <= 50 ~ "Age_30_to_50",
        TRUE ~ "Above_50"
      )
    )

  )

4. Exploratory Data Analysis

p1 <- ggplot(
  train,
  aes(x = expm1(y))
) +
  geom_histogram(
    bins = 50,
    fill = "#2c3e50",
    color = "white"
  ) +
  theme_minimal() +
  labs(
    title = "Original Cost Distribution"
  )

p2 <- ggplot(
  train %>% filter(y > 0),
  aes(x = y)
) +
  geom_histogram(
    bins = 40,
    fill = "#27ae60",
    color = "white"
  ) +
  theme_minimal() +
  labs(
    title = "Log-transformed Cost Distribution"
  )

grid.arrange(
  p1,
  p2,
  ncol = 2
)

5. Machine Learning Modeling

5.1 Sparse Matrix Preparation

cols_to_remove <- c(
  "ID",
  target_var,
  leakage_vars,
  "is_sick"
)

train_feats <- train %>%
  select(
    -any_of(cols_to_remove)
  ) %>%
  mutate(is_train = 1)

test_feats <- test %>%
  select(
    -any_of(c(
      "ID",
      leakage_vars,
      "is_sick"
    ))
  ) %>%
  mutate(is_train = 0)

all_data <- bind_rows(
  train_feats,
  test_feats
)

num_cols <- all_data %>%
  select(
    where(is.numeric),
    -is_train,
    -HH_ID
  ) %>%
  names()

for(col in num_cols){

  all_data <- all_data %>%
    mutate(
      !!sym(col) :=
        coalesce(
          !!sym(col),
          median(
            all_data[[col]],
            na.rm = TRUE
          )
        )
    )
}

old_opts <- options(
  na.action = "na.pass"
)

sparse_matrix <- sparse.model.matrix(
  ~ . - 1,
  data = all_data %>%
    select(-HH_ID)
)

options(old_opts)

X_train_full <- sparse_matrix[
  sparse_matrix[, "is_train"] == 1,
]

X_test <- sparse_matrix[
  sparse_matrix[, "is_train"] == 0,
]

X_train_full <- X_train_full[
  ,
  colnames(X_train_full) != "is_train"
]

X_test <- X_test[
  ,
  colnames(X_test) != "is_train"
]

y_train_full <- train$y

is_sick_full <- train$is_sick

5.2 Stage 1 – Frequency Model

dtrain_class <- xgb.DMatrix(
  data = X_train_full,
  label = is_sick_full
)

params_class <- list(
  booster = "gbtree",
  objective = "binary:logistic",
  eval_metric = "auc",
  eta = 0.02,
  max_depth = 5
)

cv_class <- xgb.cv(
  params = params_class,
  data = dtrain_class,
  nrounds = 300,
  nfold = 5,
  early_stopping_rounds = 30,
  verbose = 0
)

best_iter_class <-
  cv_class$best_iteration

if(
  is.null(best_iter_class) ||
  length(best_iter_class) == 0
){

  best_iter_class <- which.max(
    cv_class$evaluation_log$
      test_auc_mean
  )

}

if(
  is.null(best_iter_class) ||
  length(best_iter_class) == 0
){

  best_iter_class <- 300

}

cat(
  "Best validation AUC:",
  max(
    cv_class$evaluation_log$
      test_auc_mean
  ),
  "\n"
)
## Best validation AUC: 0.6797139
cat(
  "Best iteration:",
  best_iter_class,
  "\n"
)
## Best iteration: 297
final_model_class <- xgb.train(
  params = params_class,
  data = dtrain_class,
  nrounds = best_iter_class
)

prob_sick_test <- predict(
  final_model_class,
  xgb.DMatrix(X_test)
)

5.3 Stage 2 – Severity Model

sick_train_idx <- which(
  is_sick_full == 1
)

X_train_sick <-
  X_train_full[
    sick_train_idx,
  ]

y_train_sick <-
  y_train_full[
    sick_train_idx
  ]

dtrain_reg <- xgb.DMatrix(
  data = X_train_sick,
  label = y_train_sick
)

params_reg <- list(
  booster = "gbtree",
  objective = "reg:squarederror",
  eval_metric = "rmse",
  eta = 0.015,
  max_depth = 5
)

cv_reg <- xgb.cv(
  params = params_reg,
  data = dtrain_reg,
  nrounds = 300,
  nfold = 5,
  early_stopping_rounds = 30,
  verbose = 0
)

best_iter_reg <-
  cv_reg$best_iteration

if(
  is.null(best_iter_reg) ||
  length(best_iter_reg) == 0
){

  best_iter_reg <- which.min(
    cv_reg$evaluation_log$
      test_rmse_mean
  )

}

if(
  is.null(best_iter_reg) ||
  length(best_iter_reg) == 0
){

  best_iter_reg <- 300

}

cat(
  "Best validation RMSE:",
  min(
    cv_reg$evaluation_log$
      test_rmse_mean
  ),
  "\n"
)
## Best validation RMSE: 1.755388
cat(
  "Best iteration:",
  best_iter_reg,
  "\n"
)
## Best iteration: 217
final_model_reg <- xgb.train(
  params = params_reg,
  data = dtrain_reg,
  nrounds = best_iter_reg
)

cost_xgb_test <- pmax(

  predict(
    final_model_reg,
    xgb.DMatrix(X_test)
  ),

  0

)

6. Final Prediction & Submission

final_preds <-
  prob_sick_test *
  cost_xgb_test

final_preds[
  prob_sick_test < 0.01
] <- 0

final_preds[
  final_preds < 0
] <- 0

max_logical_y <- quantile(
  y_train_full,
  0.995,
  na.rm = TRUE
)

final_preds <- pmin(
  final_preds,
  max_logical_y
)

submission <- data.frame(
  ID = test$ID,
  y.hat = final_preds
)

write.csv(
  submission,
  "submission_Actuary_65_Neu_Ha.csv",
  row.names = FALSE
)

head(submission)
##              ID     y.hat
## 1 00001037008_3 0.8382454
## 2 00001037012_1 0.5579517
## 3 00001037015_2 2.7620220
## 4 00004004007_5 0.9505327
## 5 00004004007_4 0.7073528
## 6 00004004012_3 0.8862574

```