Untitled

library(readxl)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(glmnet)

## Loading required package: Matrix

## Loaded glmnet 4.1-8

# Đọc dữ liệu
TOCAhq <- read_excel("C:/dl/data/R-Rstudio/TOCAhq.xlsx", 
    col_types = c("text", "numeric", "numeric", 
        "text", "text", "numeric", "numeric", 
        "text", "text", "numeric", "numeric", 
        "numeric", "text", "numeric", "numeric", 
        "numeric", "numeric", "text", "numeric", 
        "text", "text", "text", "numeric", "numeric", "text", "numeric", "numeric", "text",  
        "numeric", "numeric", "numeric", 
        "text", "text", "text","text", "text", "text", "text", "text", 
        "text", "text", "numeric", "numeric"))

# Hàm để thay thế giá trị thiếu bằng số trung bình
replace_na_numeric <- function(df, vars) {
  for (var in vars) {
    if (var %in% colnames(df)) {
      mean_value <- mean(df[[var]], na.rm = TRUE)
      df[[var]][is.na(df[[var]])] <- mean_value
    }
  }
  return(df)
}

# Hàm để thay thế giá trị thiếu bằng mode
replace_na_factor <- function(df, vars) {
  for (var in vars) {
    if (var %in% colnames(df)) {
      mode_value <- names(sort(table(df[[var]]), decreasing = TRUE))[1]
      df[[var]][is.na(df[[var]])] <- mode_value
    }
  }
  return(df)
}

# Thay thế giá trị thiếu cho các biến định lượng
TOCAhq <- replace_na_numeric(TOCAhq, c("cTnI-hs2", "dapt", "HDLC"))

# Thay thế giá trị thiếu cho các biến định tính
TOCAhq <- replace_na_factor(TOCAhq, c("THBH", "RENTROP"))


# Tạo ma trận thiết kế
xvars <- model.matrix(TOCA ~ age + classbmi + tha + dysl + htl + KILLIP1 + cre + HDLC + LVEF40 + RWMA + LAD + LCx + RCA + Multivesseld + THBH , data = TOCAhq)[,-1]  # Loại bỏ cột intercept
yvar <- TOCAhq$TOCA

# Hồi quy Ridge và Lasso
ridge <- cv.glmnet(xvars, yvar, alpha = 0)
plot(ridge)

coef(ridge)

## 16 x 1 sparse Matrix of class "dgCMatrix"
##                          s1
## (Intercept)    0.3698394201
## age           -0.0009336414
## classbmi1      0.0219592219
## tha           -0.0326457062
## dysl1         -0.0229749300
## htl1          -0.0196919927
## KILLIP11       0.0239279218
## cre           -0.0003980327
## HDLC          -0.0247399135
## LVEF401        0.0209241442
## RWMA1          0.0251992826
## LAD1          -0.0432814017
## LCx1           0.0326721907
## RCA1           0.0182150007
## Multivesseld1  0.0222541093
## THBH1          0.0735823677

lasso <- cv.glmnet(xvars, yvar, alpha = 1)
plot(lasso)

coef(lasso)

## 16 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept)    0.25881822
## age            .         
## classbmi1      .         
## tha            .         
## dysl1          .         
## htl1           .         
## KILLIP11       .         
## cre            .         
## HDLC           .         
## LVEF401        .         
## RWMA1          .         
## LAD1          -0.07654126
## LCx1           .         
## RCA1           .         
## Multivesseld1  .         
## THBH1          0.13638176

# Kết quả LASSO cho thấy có các biến như age, tha, dysl, htl, cre, RWMA, Culprit là nhánh LAD và THBH tiên lượng tốt mà không có hiện tượng đa cộng tuyến ## Đưa các biến  LAD và THBH vào chạy đa biến
# Tạo biến giả từ biến phân loại Culprit
xvars_full <- model.matrix(~ Culprit - 1, data = TOCAhq)

# Chọn biến Culprit2 và 3 từ ma trận thiết kế
xvars_selected <- xvars_full[, "Culprit2", drop = FALSE]
xvars_selected1 <- xvars_full[, "Culprit3", drop = FALSE]
temp1 <- glm(TOCA ~ age + tha + dysl + htl + cre + RWMA + LAD + THBH, family="binomial", data=TOCAhq)
search1 <- step(temp1)

## Start:  AIC=138.29
## TOCA ~ age + tha + dysl + htl + cre + RWMA + LAD + THBH
## 
##        Df Deviance    AIC
## <none>      120.29 138.29
## - htl   1   122.67 138.67
## - tha   1   123.05 139.05
## - dysl  1   125.31 141.31
## - age   1   126.47 142.47
## - RWMA  1   126.56 142.56
## - cre   1   127.67 143.67
## - THBH  1   132.55 148.55
## - LAD   1   136.95 152.95

summary(search1)

## 
## Call:
## glm(formula = TOCA ~ age + tha + dysl + htl + cre + RWMA + LAD + 
##     THBH, family = "binomial", data = TOCAhq)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  6.59843    2.22444   2.966 0.003014 ** 
## age         -0.05041    0.02119  -2.379 0.017349 *  
## tha         -1.18802    0.70730  -1.680 0.093024 .  
## dysl1       -1.08894    0.49558  -2.197 0.027999 *  
## htl1        -0.84334    0.56465  -1.494 0.135292    
## cre         -0.03450    0.01631  -2.114 0.034474 *  
## RWMA1        1.19295    0.49341   2.418 0.015616 *  
## LAD1        -2.01555    0.54844  -3.675 0.000238 ***
## THBH1        2.00374    0.59456   3.370 0.000751 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 174.33  on 157  degrees of freedom
## Residual deviance: 120.29  on 149  degrees of freedom
## AIC: 138.29
## 
## Number of Fisher Scoring iterations: 6

# Sử dụng hàm step để chọn biến
search <- step(temp1)

## Start:  AIC=138.29
## TOCA ~ age + tha + dysl + htl + cre + RWMA + LAD + THBH
## 
##        Df Deviance    AIC
## <none>      120.29 138.29
## - htl   1   122.67 138.67
## - tha   1   123.05 139.05
## - dysl  1   125.31 141.31
## - age   1   126.47 142.47
## - RWMA  1   126.56 142.56
## - cre   1   127.67 143.67
## - THBH  1   132.55 148.55
## - LAD   1   136.95 152.95

# Tính OR và khoảng tin cậy 95%
OR <- exp(coef(temp1))
conf_int <- exp(confint(temp1))

## Waiting for profiling to be done...

results <- cbind(OR, conf_int)
colnames(results) <- c("OR", "2.5%", "97.5%")
results

##                      OR        2.5%        97.5%
## (Intercept) 733.9448633 12.42816070 7.838612e+04
## age           0.9508423  0.91016727 9.896660e-01
## tha           0.3048256  0.07476940 1.246025e+00
## dysl1         0.3365722  0.12322259 8.733031e-01
## htl1          0.4302708  0.13314003 1.248125e+00
## cre           0.9660919  0.93392370 9.933429e-01
## RWMA1         3.2968059  1.28904324 9.062632e+00
## LAD1          0.1332467  0.04183658 3.668255e-01
## THBH1         7.4167485  2.39206306 2.522522e+01

Untitled

Hải Vũ

2024-09-09