Nhập dữ liệu vào R và kiểm tra dữ liệu thiếu trong mẫu:

library(mlbench)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(glmnet)

## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-10

library(mice)

## 
## Attaching package: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

data("PimaIndiansDiabetes2")
df <- PimaIndiansDiabetes2
missing_rate <- colMeans(is.na(df))
print(round(missing_rate * 100, 1))

## pregnant  glucose pressure  triceps  insulin     mass pedigree      age 
##      0.0      0.7      4.6     29.6     48.7      1.4      0.0      0.0 
## diabetes 
##      0.0

Bước 1: Thực hiện MI dữ liệu:

# Tái lập kết quả MI
set.seed(123) 
# m = 10: Tạo 10 bộ dữ liệu được quy nạp
# method = 'pmm': Predictive Mean Matching (chuẩn cho biến liên tục)
imp_data <- mice(df, m = 10, method = "pmm", printFlag = FALSE)
# Kiểm tra một bộ dữ liệu đã quy nạp:
head(complete(imp_data, 1))

##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35     285 33.6    0.627  50      pos
## 2        1      85       66      29      88 26.6    0.351  31      neg
## 3        8     183       64       8     185 23.3    0.672  32      pos
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos
## 6        5     116       74      20     165 25.6    0.201  30      neg

Bước 2: Thiết lập Vòng lặp phân tích Ridge và Lasso trên từng bộ dữ liệu:

n_vars <- ncol(df) # 9 cột (8 biến + 1 outcome)
variable_names <- c("(Intercept)", colnames(df)[-9]) # Tên biến
# Ma trận lưu kết quả Ridge và Lasso (số dòng = số biến, số cột = số lần quy nạp m)
coef_matrix_ridge <- matrix(NA, nrow = length(variable_names), ncol = 10)
coef_matrix_lasso <- matrix(NA, nrow = length(variable_names), ncol = 10)
rownames(coef_matrix_ridge) <- variable_names
rownames(coef_matrix_lasso) <- variable_names
# ---BẮT ĐẦU VÒNG LẶP TRÊN 10 BỘ DỮ LIỆU ---
for (i in 1:10) {
  # a. Lấy dữ liệu quy nạp thứ i
  current_data <- complete(imp_data, i)
  # b. Chuẩn bị ma trận X và y
  y_tmp <- ifelse(current_data$diabetes == "pos", 1, 0)
  X_tmp <- current_data %>% select(-diabetes) %>% as.matrix()
  # c. Chạy RIDGE (alpha = 0)
  cv_ridge <- cv.glmnet(X_tmp, y_tmp, family = "binomial", alpha = 0)
  # Lưu hệ số tại lambda.min
  coef_matrix_ridge[, i] <- as.vector(coef(cv_ridge, s = "lambda.min"))
  # d. Chạy LASSO (alpha = 1)
  cv_lasso <- cv.glmnet(X_tmp, y_tmp, family = "binomial", alpha = 1)
  # Lưu hệ số tại lambda.min
  coef_matrix_lasso[, i] <- as.vector(coef(cv_lasso, s = "lambda.min"))
}

Bước 3: Tổng hợp kết quả (Pooling):

# --- XỬ LÝ KẾT QUẢ RIDGE ---
ridge_final_coef <- rowMeans(coef_matrix_ridge)
print(ridge_final_coef)

##   (Intercept)      pregnant       glucose      pressure       triceps 
## -7.9822035336  0.0987452934  0.0299788871 -0.0032156685  0.0152418200 
##       insulin          mass      pedigree           age 
##  0.0002124149  0.0651977622  0.7322225844  0.0130237716

# --- XỬ LÝ KẾT QUẢ LASSO ---
# 1. Tính tần suất biến được chọn (Hệ số != 0)
lasso_selection_freq <- rowMeans(coef_matrix_lasso != 0) * 100
# 2. Tính hệ số trung bình
lasso_avg_coef <- rowMeans(coef_matrix_lasso)
results_lasso <- data.frame(
  Variable = variable_names,
  Selection_Frequency_Pct = lasso_selection_freq,
  Avg_Coefficient = lasso_avg_coef
)
print(results_lasso)

##                Variable Selection_Frequency_Pct Avg_Coefficient
## (Intercept) (Intercept)                     100   -8.7872646275
## pregnant       pregnant                     100    0.1123899296
## glucose         glucose                     100    0.0359281830
## pressure       pressure                      60   -0.0037311023
## triceps         triceps                      80    0.0105870075
## insulin         insulin                      60   -0.0004024512
## mass               mass                     100    0.0765334662
## pedigree       pedigree                     100    0.7545354947
## age                 age                     100    0.0096502656

# Biểu đồ so sánh trực quan hệ số Ridge và Lasso
## TẠO DATAFRAME TỔNG HỢP CHO BIỂU ĐỒ
# Kết hợp hệ số Ridge và Lasso vào một dataframe duy nhất
# Loại bỏ hệ số chặn (Intercept) để so sánh trực quan các biến dự đoán
df_comparison <- data.frame(
  Variable = variable_names[-1], # Loại bỏ Intercept
  Ridge_Coef = ridge_final_coef[-1],
  Lasso_Coef = lasso_avg_coef[-1]
)
# Chuyển đổi định dạng để vẽ biểu đồ (long format)
df_long <- df_comparison %>%
  pivot_longer(
    cols = c(Ridge_Coef, Lasso_Coef),
    names_to = "Model",
    values_to = "Coefficient"
  )
# VẼ BIỂU ĐỒ BẰNG GGOPLOT2
ggplot(df_long, aes(x = Variable, y = Coefficient, fill = Model)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.8), width = 0.7) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
  labs(
    title = "So sánh Hệ số Hồi quy Ridge và Lasso (Pooled MI)",
    y = "Giá trị Hệ số (Coefficient)",
    x = "Biến Dự đoán"
  ) +
  scale_fill_manual(values = c("Ridge_Coef" = "#00BFC4", "Lasso_Coef" = "#F8766D")) +
  theme_minimal() +
  coord_flip()

Phân tích dữ liệu Pima2 sử dụng phương pháp hồi qui Ridge và Lasso kết hợp Multiple Imputation

Trần Dương Đăng Khoa

2025-12-02

Nhập dữ liệu vào R và kiểm tra dữ liệu thiếu trong mẫu:

Bước 1: Thực hiện MI dữ liệu:

Bước 2: Thiết lập Vòng lặp phân tích Ridge và Lasso trên từng bộ dữ liệu:

Bước 3: Tổng hợp kết quả (Pooling):