Chương trình tập huấn phân tích dữ liệu bằng ngôn ngữ R - BV 108

Ngày 5: Hồi qui logistic

Việc 1. Đọc dữ liệu ‘Oxteo-data.xlsx’ và gọi đối tượng là ‘fx’

library(readxl)
fx = as.data.frame(read_excel("C:\\Thach\\VN trips\\2024_2Aug\\Data Analysis workshop (Hospital 108)\\Datasets\\Osteo-data.xlsx"))
dim(fx)
## [1] 2216   17
summary(fx)
##        id             sex                 age            weight      
##  Min.   :   1.0   Length:2216        Min.   :57.00   Min.   : 34.00  
##  1st Qu.: 554.8   Class :character   1st Qu.:65.00   1st Qu.: 60.00  
##  Median :1108.5   Mode  :character   Median :70.00   Median : 69.00  
##  Mean   :1108.5                      Mean   :70.89   Mean   : 70.14  
##  3rd Qu.:1662.2                      3rd Qu.:76.00   3rd Qu.: 79.00  
##  Max.   :2216.0                      Max.   :96.00   Max.   :133.00  
##                                                      NA's   :53      
##      height         prior_fx          fnbmd           smoking      
##  Min.   :136.0   Min.   :0.0000   Min.   :0.2800   Min.   :0.0000  
##  1st Qu.:158.0   1st Qu.:0.0000   1st Qu.:0.7300   1st Qu.:0.0000  
##  Median :164.0   Median :0.0000   Median :0.8200   Median :0.0000  
##  Mean   :164.9   Mean   :0.1611   Mean   :0.8287   Mean   :0.4176  
##  3rd Qu.:171.0   3rd Qu.:0.0000   3rd Qu.:0.9300   3rd Qu.:1.0000  
##  Max.   :196.0   Max.   :1.0000   Max.   :1.5100   Max.   :1.0000  
##  NA's   :54                       NA's   :89       NA's   :1       
##    parkinson           rheum          hypertension       diabetes    
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.00000   Median :0.00000   Median :1.0000   Median :0.000  
##  Mean   :0.06498   Mean   :0.03881   Mean   :0.5063   Mean   :0.111  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:0.000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.000  
##                                                                      
##       copd           cancer             cvd            falls_n      
##  Min.   :0.000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.000   Median :0.00000   Median :0.0000   Median :0.0000  
##  Mean   :0.111   Mean   :0.08529   Mean   :0.3872   Mean   :0.2843  
##  3rd Qu.:0.000   3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.000   Max.   :1.00000   Max.   :1.0000   Max.   :2.0000  
##                                                                     
##        fx        
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.2595  
##  3rd Qu.:1.0000  
##  Max.   :1.0000  
## 

Việc 2. Mô tả đặc điểm của dữ liệu theo tình trạng gãy xương

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
fx = fx %>% mutate(
  prior_fx = as.factor(prior_fx),
  falls_n = as.factor(falls_n),
  smoking = as.factor(smoking),
  parkinson = as.factor(parkinson),
  rheum = as.factor(rheum),
  hypertension = as.factor(hypertension),
  diabetes = as.factor(diabetes),
  copd =  as.factor(copd),
  cancer = as.factor(cancer),
  cvd = as.factor(cvd),
  fx = as.factor(fx)
)

library(table1)
## 
## Attaching package: 'table1'
## 
## The following objects are masked from 'package:base':
## 
##     units, units<-
table1(~ sex + age + weight + height + fnbmd + prior_fx + falls_n + smoking + parkinson + rheum + hypertension + diabetes + copd + cancer + cvd | fx, data = fx)
0
(N=1641)
1
(N=575)
Overall
(N=2216)
sex
Female 932 (56.8%) 426 (74.1%) 1358 (61.3%)
Male 709 (43.2%) 149 (25.9%) 858 (38.7%)
age
Mean (SD) 70.2 (6.89) 72.9 (7.58) 70.9 (7.17)
Median [Min, Max] 69.0 [57.0, 94.0] 72.0 [60.0, 96.0] 70.0 [57.0, 96.0]
weight
Mean (SD) 71.5 (14.2) 66.0 (13.3) 70.1 (14.2)
Median [Min, Max] 70.0 [36.0, 133] 65.0 [34.0, 102] 69.0 [34.0, 133]
Missing 23 (1.4%) 30 (5.2%) 53 (2.4%)
height
Mean (SD) 166 (9.40) 162 (8.74) 165 (9.35)
Median [Min, Max] 165 [139, 192] 162 [136, 196] 164 [136, 196]
Missing 24 (1.5%) 30 (5.2%) 54 (2.4%)
fnbmd
Mean (SD) 0.854 (0.151) 0.754 (0.140) 0.829 (0.155)
Median [Min, Max] 0.850 [0.380, 1.51] 0.750 [0.280, 1.25] 0.820 [0.280, 1.51]
Missing 51 (3.1%) 38 (6.6%) 89 (4.0%)
prior_fx
0 1419 (86.5%) 440 (76.5%) 1859 (83.9%)
1 222 (13.5%) 135 (23.5%) 357 (16.1%)
falls_n
0 1275 (77.7%) 459 (79.8%) 1734 (78.2%)
1 252 (15.4%) 82 (14.3%) 334 (15.1%)
2 114 (6.9%) 34 (5.9%) 148 (6.7%)
smoking
0 935 (57.0%) 355 (61.7%) 1290 (58.2%)
1 705 (43.0%) 220 (38.3%) 925 (41.7%)
Missing 1 (0.1%) 0 (0%) 1 (0.0%)
parkinson
0 1537 (93.7%) 535 (93.0%) 2072 (93.5%)
1 104 (6.3%) 40 (7.0%) 144 (6.5%)
rheum
0 1576 (96.0%) 554 (96.3%) 2130 (96.1%)
1 65 (4.0%) 21 (3.7%) 86 (3.9%)
hypertension
0 805 (49.1%) 289 (50.3%) 1094 (49.4%)
1 836 (50.9%) 286 (49.7%) 1122 (50.6%)
diabetes
0 1456 (88.7%) 514 (89.4%) 1970 (88.9%)
1 185 (11.3%) 61 (10.6%) 246 (11.1%)
copd
0 1452 (88.5%) 518 (90.1%) 1970 (88.9%)
1 189 (11.5%) 57 (9.9%) 246 (11.1%)
cancer
0 1503 (91.6%) 524 (91.1%) 2027 (91.5%)
1 138 (8.4%) 51 (8.9%) 189 (8.5%)
cvd
0 993 (60.5%) 365 (63.5%) 1358 (61.3%)
1 648 (39.5%) 210 (36.5%) 858 (38.7%)

Việc 3. Đánh giá mối liên quan giữa MĐX cổ xương đùi và nguy cơ gãy xương

3.1 Biễu đồ hợp thể hiện mối liên quan giữa MĐX và nguy cơ gãy xương

p = ggplot(data = fx, aes(x = fx, y = fnbmd, fill = fx, col = fx))
p1 = p + geom_boxplot(col = "black") + geom_jitter(alpha = 0.05) + labs(x = "Gãy xương", y = "MĐX cổ xương đùi (g/cm2)") + ggtitle("Liên quan giữa MĐX và khả năng bị gãy xương")
p1
## Warning: Removed 89 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 89 rows containing missing values (`geom_point()`).

3.2 Mô hình hồi qui logistic đạnh giá mối liên quan giữa MĐX và nguy cơ gãy xương

m.bmd = glm(fx ~ fnbmd, family = "binomial", data = fx)
summary(m.bmd)
## 
## Call:
## glm(formula = fx ~ fnbmd, family = "binomial", data = fx)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.7609     0.3055   9.036   <2e-16 ***
## fnbmd        -4.7917     0.3856 -12.427   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2403.6  on 2126  degrees of freedom
## Residual deviance: 2222.8  on 2125  degrees of freedom
##   (89 observations deleted due to missingness)
## AIC: 2226.8
## 
## Number of Fisher Scoring iterations: 4
library(epiDisplay)
## Warning: package 'epiDisplay' was built under R version 4.3.2
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: nnet
## 
## Attaching package: 'epiDisplay'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
logistic.display(m.bmd)
## 
## Logistic regression predicting fx : 1 vs 0 
##  
##                    OR(95%CI)      P(Wald's test) P(LR-test)
## fnbmd (cont. var.) 0.01 (0,0.02)  < 0.001        < 0.001   
##                                                            
## Log-likelihood = -1111.4092
## No. of observations = 2127
## AIC value = 2226.8184
library(Publish)
## Loading required package: prodlim
publish(m.bmd)
##  Variable Units OddsRatio       CI.95 p-value 
##     fnbmd            0.01 [0.00;0.02] < 1e-04

Nhận xét: gia tăng mỗi 1 g/cm2 không thực tế

Khuyến cáo: mối liên quan giữa giảm 1 độ lệch chuẩn MĐX và nguy cơ gãy xương

fx$fnbmd.sd = fx$fnbmd/(-0.155)

m.bmd.sd = glm(fx ~ fnbmd.sd, family = "binomial", data = fx)
summary(m.bmd.sd)
## 
## Call:
## glm(formula = fx ~ fnbmd.sd, family = "binomial", data = fx)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.76088    0.30554   9.036   <2e-16 ***
## fnbmd.sd     0.74271    0.05976  12.427   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2403.6  on 2126  degrees of freedom
## Residual deviance: 2222.8  on 2125  degrees of freedom
##   (89 observations deleted due to missingness)
## AIC: 2226.8
## 
## Number of Fisher Scoring iterations: 4
publish(m.bmd.sd)
##  Variable Units OddsRatio       CI.95 p-value 
##  fnbmd.sd            2.10 [1.87;2.36] < 1e-04

Việc 4. Đánh giá mối liên quan giữa giới tính và nguy cơ gãy xương

m.sex = glm(fx ~ sex, family = "binomial", data = fx)
summary(m.sex)
## 
## Call:
## glm(formula = fx ~ sex, family = "binomial", data = fx)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.78289    0.05848 -13.386  < 2e-16 ***
## sexMale     -0.77702    0.10743  -7.232 4.74e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2537.4  on 2215  degrees of freedom
## Residual deviance: 2481.6  on 2214  degrees of freedom
## AIC: 2485.6
## 
## Number of Fisher Scoring iterations: 4
publish(m.sex)
##  Variable  Units OddsRatio       CI.95 p-value 
##       sex Female       Ref                     
##             Male      0.46 [0.37;0.57]  <1e-04

Việc 5. Đánh giá mối liên quan giữa MĐX và giới tính với nguy cơ gãy xương

5.1 Đánh giá mối liên quan giữa 3 yếu tố trên

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
vars = fx[, c("fnbmd", "sex", "fx")]
ggpairs(data = vars, mapping = aes(color = fx))
## Warning: Removed 89 rows containing non-finite values (`stat_density()`).
## Warning: Removed 89 rows containing non-finite values (`stat_boxplot()`).
## Removed 89 rows containing non-finite values (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 89 rows containing non-finite values (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 89 rows containing non-finite values (`stat_bin()`).

5.2 Đánh giá mối liên quan giữa MĐX và nguy cơ gãy xương sau khi hiệu chỉnh cho giới tính

m.multi = glm(fx ~ fnbmd.sd + sex, family = "binomial", data = fx)
summary(m.multi)
## 
## Call:
## glm(formula = fx ~ fnbmd.sd + sex, family = "binomial", data = fx)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.62064    0.31432   8.337   <2e-16 ***
## fnbmd.sd     0.70112    0.06367  11.012   <2e-16 ***
## sexMale     -0.22252    0.12112  -1.837   0.0662 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2403.6  on 2126  degrees of freedom
## Residual deviance: 2219.4  on 2124  degrees of freedom
##   (89 observations deleted due to missingness)
## AIC: 2225.4
## 
## Number of Fisher Scoring iterations: 4
publish(m.multi)
##  Variable  Units OddsRatio       CI.95   p-value 
##  fnbmd.sd             2.02 [1.78;2.28]   < 1e-04 
##       sex Female       Ref                       
##             Male      0.80 [0.63;1.01]   0.06618

Việc 6. Xây dựng mô hình dự báo nguy cơ gãy xương

6.1 Xây dựng mô hình dự báo gãy xương

library(BMA)
## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-4)
fx.2 = na.omit(fx)
bma.fx = bic.glm(fx ~ sex + age + weight + height + fnbmd + prior_fx + falls_n + smoking + parkinson + rheum + hypertension + diabetes + copd + cancer + cvd, data = fx.2, strict = F, OR = 20, glm.family = "binomial")
summary(bma.fx)
## 
## Call:
## bic.glm.formula(f = fx ~ sex + age + weight + height + fnbmd +     prior_fx + falls_n + smoking + parkinson + rheum + hypertension +     diabetes + copd + cancer + cvd, data = fx.2, glm.family = "binomial",     strict = F, OR = 20)
## 
## 
##   4  models were selected
##  Best  4  models (cumulative posterior probability =  1 ): 
## 
##                 p!=0    EV        SD        model 1     model 2     model 3   
## Intercept       100     2.249962  0.674258   2.509e+00   2.340e+00   1.140e+00
## sexMale          18.2  -0.048048  0.114551       .      -2.578e-01       .    
## age              16.2   0.002663  0.006838       .           .       1.602e-02
## weight            0.0   0.000000  0.000000       .           .           .    
## height            0.0   0.000000  0.000000       .           .           .    
## fnbmd           100.0  -4.487636  0.436674  -4.595e+00  -4.279e+00  -4.302e+00
## prior_fx        100.0                                                         
##         .1              0.530729  0.134289   5.306e-01   5.445e-01   5.156e-01
## falls_n           0.0                                                         
##        .1               0.000000  0.000000       .           .           .    
##        .2               0.000000  0.000000       .           .           .    
## smoking           0.0                                                         
##        .1               0.000000  0.000000       .           .           .    
## parkinson         0.0                                                         
##          .1             0.000000  0.000000       .           .           .    
## rheum             0.0                                                         
##      .1                 0.000000  0.000000       .           .           .    
## hypertension      0.0                                                         
##             .1          0.000000  0.000000       .           .           .    
## diabetes          0.0                                                         
##         .1              0.000000  0.000000       .           .           .    
## copd              0.0                                                         
##     .1                  0.000000  0.000000       .           .           .    
## cancer            0.0                                                         
##       .1                0.000000  0.000000       .           .           .    
## cvd               0.0                                                         
##    .1                   0.000000  0.000000       .           .           .    
##                                                                               
## nVar                                           2           3           3      
## BIC                                         -1.402e+04  -1.402e+04  -1.402e+04
## post prob                                    0.696       0.142       0.122    
##                 model 4   
## Intercept        7.978e-01
## sexMale         -2.851e-01
## age              1.780e-02
## weight               .    
## height               .    
## fnbmd           -3.918e+00
## prior_fx                  
##         .1       5.296e-01
## falls_n                   
##        .1            .    
##        .2            .    
## smoking                   
##        .1            .    
## parkinson                 
##          .1          .    
## rheum                     
##      .1              .    
## hypertension              
##             .1       .    
## diabetes                  
##         .1           .    
## copd                      
##     .1               .    
## cancer                    
##       .1             .    
## cvd                       
##    .1                .    
##                           
## nVar               4      
## BIC             -1.402e+04
## post prob        0.040    
## 
##   1  observations deleted due to missingness.
imageplot.bma(bma.fx)

### 6.2 Trình bày kết quả

m.final = glm(fx ~ fnbmd.sd + prior_fx, family = "binomial", data = fx)
summary(m.final)
## 
## Call:
## glm(formula = fx ~ fnbmd.sd + prior_fx, family = "binomial", 
##     data = fx)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.49096    0.31301   7.958 1.75e-15 ***
## fnbmd.sd     0.70829    0.06033  11.740  < 2e-16 ***
## prior_fx1    0.52983    0.13386   3.958 7.55e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2403.6  on 2126  degrees of freedom
## Residual deviance: 2207.6  on 2124  degrees of freedom
##   (89 observations deleted due to missingness)
## AIC: 2213.6
## 
## Number of Fisher Scoring iterations: 4
publish(m.final)
##  Variable Units OddsRatio       CI.95 p-value 
##  fnbmd.sd            2.03 [1.80;2.29]  <1e-04 
##  prior_fx     0       Ref                     
##               1      1.70 [1.31;2.21]  <1e-04

Việc 7. Ghi lại tất cả các hàm/lệnh trên và chia sẻ lên tài khoản rpubs (https://rpubs.com/ThachTran/1214079)