Annual healthcare expenditure data typically exhibit two major actuarial characteristics: zero-inflation and heavy right-skewness. To address this problem, we adopt a Pure Premium Framework using a two-stage Frequency-Severity modeling approach.
The expected healthcare cost follows the actuarial decomposition:
\[ E(Y) = P(Y > 0) \times E(Y \mid Y > 0) \]
# Household ID
train <- train %>%
mutate(HH_ID = gsub("_.*", "", ID))
test <- test %>%
mutate(HH_ID = gsub("_.*", "", ID))
# Age imputation
median_age <- median(
train$AGE,
na.rm = TRUE
)
train <- train %>%
mutate(
AGE = coalesce(AGE, median_age)
)
test <- test %>%
mutate(
AGE = coalesce(AGE, median_age)
)
# Income variables
income_cols <- grep(
"LUONG|TIEN|THUONG|TRO_CAP",
colnames(train),
value = TRUE
)
train <- train %>%
mutate(
across(
all_of(income_cols),
~replace_na(.x, 0)
)
)
test <- test %>%
mutate(
across(
all_of(income_cols),
~replace_na(.x, 0)
)
)
# Categorical variables
cat_cols <- c(
"GENDER",
"MARRIED_STATUS",
"MA_NGHE_NGHIEP",
"LOAI_THE_1",
"LOAI_THE_2",
"CO_THE_BAO_HIEM"
)
train <- train %>%
mutate(
across(
all_of(cat_cols),
~as.factor(
replace_na(
as.character(.),
"Unknown"
)
)
)
)
test <- test %>%
mutate(
across(
all_of(cat_cols),
~as.factor(
replace_na(
as.character(.),
"Unknown"
)
)
)
)# Household-level features
hh_structure_train <- train %>%
group_by(HH_ID) %>%
summarise(
HH_SIZE = n(),
HH_MEAN_AGE = mean(
AGE,
na.rm = TRUE
),
HH_CHILD_COUNT = sum(
AGE < 18,
na.rm = TRUE
),
HH_ELDERLY_COUNT = sum(
AGE > 60,
na.rm = TRUE
),
.groups = "drop"
) %>%
mutate(
HH_ELDERLY_RATIO =
HH_ELDERLY_COUNT / HH_SIZE
)
train <- train %>%
left_join(
hh_structure_train,
by = "HH_ID"
)
test <- test %>%
left_join(
hh_structure_train,
by = "HH_ID"
)
# Missing handling for test
for(col in c(
"HH_SIZE",
"HH_MEAN_AGE",
"HH_CHILD_COUNT",
"HH_ELDERLY_COUNT",
"HH_ELDERLY_RATIO"
)) {
med_val <- median(
train[[col]],
na.rm = TRUE
)
test <- test %>%
mutate(
!!sym(col) :=
coalesce(
!!sym(col),
med_val
)
)
}
# Final features
train <- train %>%
mutate(
TONG_THU_NHAP =
TIEN_LUONG_12T +
TIEN_LE_TET +
THUONG_PHU_CAP +
LUONG_CONG_VIEC_2,
AGE_SQ = AGE^2,
HAS_INSURANCE =
ifelse(
CO_THE_BAO_HIEM == "1",
1,
0
),
AGE_GROUP = as.factor(
case_when(
AGE <= 30 ~ "Under_30",
AGE <= 50 ~ "Age_30_to_50",
TRUE ~ "Above_50"
)
),
is_sick =
ifelse(y > 1e-8, 1, 0)
)
test <- test %>%
mutate(
TONG_THU_NHAP =
TIEN_LUONG_12T +
TIEN_LE_TET +
THUONG_PHU_CAP +
LUONG_CONG_VIEC_2,
AGE_SQ = AGE^2,
HAS_INSURANCE =
ifelse(
CO_THE_BAO_HIEM == "1",
1,
0
),
AGE_GROUP = as.factor(
case_when(
AGE <= 30 ~ "Under_30",
AGE <= 50 ~ "Age_30_to_50",
TRUE ~ "Above_50"
)
)
)p1 <- ggplot(
train,
aes(x = expm1(y))
) +
geom_histogram(
bins = 50,
fill = "#2c3e50",
color = "white"
) +
theme_minimal() +
labs(
title = "Original Cost Distribution"
)
p2 <- ggplot(
train %>% filter(y > 0),
aes(x = y)
) +
geom_histogram(
bins = 40,
fill = "#27ae60",
color = "white"
) +
theme_minimal() +
labs(
title = "Log-transformed Cost Distribution"
)
grid.arrange(
p1,
p2,
ncol = 2
)cols_to_remove <- c(
"ID",
target_var,
leakage_vars,
"is_sick"
)
train_feats <- train %>%
select(
-any_of(cols_to_remove)
) %>%
mutate(is_train = 1)
test_feats <- test %>%
select(
-any_of(c(
"ID",
leakage_vars,
"is_sick"
))
) %>%
mutate(is_train = 0)
all_data <- bind_rows(
train_feats,
test_feats
)
num_cols <- all_data %>%
select(
where(is.numeric),
-is_train,
-HH_ID
) %>%
names()
for(col in num_cols){
all_data <- all_data %>%
mutate(
!!sym(col) :=
coalesce(
!!sym(col),
median(
all_data[[col]],
na.rm = TRUE
)
)
)
}
old_opts <- options(
na.action = "na.pass"
)
sparse_matrix <- sparse.model.matrix(
~ . - 1,
data = all_data %>%
select(-HH_ID)
)
options(old_opts)
X_train_full <- sparse_matrix[
sparse_matrix[, "is_train"] == 1,
]
X_test <- sparse_matrix[
sparse_matrix[, "is_train"] == 0,
]
X_train_full <- X_train_full[
,
colnames(X_train_full) != "is_train"
]
X_test <- X_test[
,
colnames(X_test) != "is_train"
]
y_train_full <- train$y
is_sick_full <- train$is_sickdtrain_class <- xgb.DMatrix(
data = X_train_full,
label = is_sick_full
)
params_class <- list(
booster = "gbtree",
objective = "binary:logistic",
eval_metric = "auc",
eta = 0.02,
max_depth = 5
)
cv_class <- xgb.cv(
params = params_class,
data = dtrain_class,
nrounds = 300,
nfold = 5,
early_stopping_rounds = 30,
verbose = 0
)
best_iter_class <-
cv_class$best_iteration
if(
is.null(best_iter_class) ||
length(best_iter_class) == 0
){
best_iter_class <- which.max(
cv_class$evaluation_log$
test_auc_mean
)
}
if(
is.null(best_iter_class) ||
length(best_iter_class) == 0
){
best_iter_class <- 300
}
cat(
"Best validation AUC:",
max(
cv_class$evaluation_log$
test_auc_mean
),
"\n"
)## Best validation AUC: 0.6797139
## Best iteration: 297
sick_train_idx <- which(
is_sick_full == 1
)
X_train_sick <-
X_train_full[
sick_train_idx,
]
y_train_sick <-
y_train_full[
sick_train_idx
]
dtrain_reg <- xgb.DMatrix(
data = X_train_sick,
label = y_train_sick
)
params_reg <- list(
booster = "gbtree",
objective = "reg:squarederror",
eval_metric = "rmse",
eta = 0.015,
max_depth = 5
)
cv_reg <- xgb.cv(
params = params_reg,
data = dtrain_reg,
nrounds = 300,
nfold = 5,
early_stopping_rounds = 30,
verbose = 0
)
best_iter_reg <-
cv_reg$best_iteration
if(
is.null(best_iter_reg) ||
length(best_iter_reg) == 0
){
best_iter_reg <- which.min(
cv_reg$evaluation_log$
test_rmse_mean
)
}
if(
is.null(best_iter_reg) ||
length(best_iter_reg) == 0
){
best_iter_reg <- 300
}
cat(
"Best validation RMSE:",
min(
cv_reg$evaluation_log$
test_rmse_mean
),
"\n"
)## Best validation RMSE: 1.755388
## Best iteration: 217
final_preds <-
prob_sick_test *
cost_xgb_test
final_preds[
prob_sick_test < 0.01
] <- 0
final_preds[
final_preds < 0
] <- 0
max_logical_y <- quantile(
y_train_full,
0.995,
na.rm = TRUE
)
final_preds <- pmin(
final_preds,
max_logical_y
)
submission <- data.frame(
ID = test$ID,
y.hat = final_preds
)
write.csv(
submission,
"submission_Actuary_65_Neu_Ha.csv",
row.names = FALSE
)
head(submission)## ID y.hat
## 1 00001037008_3 0.8382454
## 2 00001037012_1 0.5579517
## 3 00001037015_2 2.7620220
## 4 00004004007_5 0.9505327
## 5 00004004007_4 0.7073528
## 6 00004004012_3 0.8862574
```