Mô hình hồi quy logistic

Package sẽ sử dụng: epiDisplay, BMA, GGally Dữ liệu thực hành: “Hip fracture data.csv” (loãng xương) và “wdbc.csv” (ung thư vú)

Task 1: Phân tích mô tả

os = read.csv("C:\\Users\\Nguyen\\Desktop\\Workshop NCKH 2019\\Dataset\\Hip fracture data.csv")
str(os$hipfx)
##  int [1:2788] 0 0 0 0 0 0 0 0 0 0 ...
os$hipfx = as.factor(os$hipfx)
str(os$hipfx)
##  Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
head(os)
##   id     dov gender age      dob visit   v1   v2    v3    v4 wt bmi  ht v5
## 1  3 15/6/89   Male  73   8/6/16     1 0.98 0.88 1.079 1.458 98  32 175 NA
## 2  8 17/4/89 Female  67 11/12/21     1 0.85 0.85 0.966 1.325 72  26 166 18
## 3  9 12/6/90   Male  68   8/1/22     1 0.87 0.84 1.013 1.494 87  26 184 36
## 4 10  4/6/90 Female  62  15/5/28     1 0.62 0.71 0.839 1.214 72  24 173 NA
## 5 23  8/8/89   Male  61  22/9/28     1 0.87 0.60 0.811 1.144 72  24 173 44
## 6 24  3/5/89 Female  76   1/8/13     1 0.76 0.58 0.743 0.980 67  28 156 15
##     v6 v7 v8 v9 hipfx timehip
## 1 39.9  1  0  0     0    0.55
## 2 31.0  0  0  0     0   19.68
## 3 28.6  0  0  0     0    5.05
## 4 28.2  1  0  0     0   18.55
## 5 28.9  1  0  0     0   19.37
## 6 33.3  0  0  0     0   12.30
# Phân tích mô tả các biến gender, age, v1, v2, v3, v4, wt, ht, bmi theo hipfx
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
table1(~gender + age + v1 + v2 + v3 + v4 + wt + ht + bmi | hipfx, data=os)
0
(n=2599)
1
(n=189)
Overall
(n=2788)
gender
Female 1512 (58.2%) 142 (75.1%) 1654 (59.3%)
Male 1087 (41.8%) 47 (24.9%) 1134 (40.7%)
age
Mean (SD) 68.9 (6.35) 75.3 (7.74) 69.3 (6.65)
Median [Min, Max] 67.0 [60.0, 94.0] 75.0 [60.0, 96.0] 68.0 [60.0, 96.0]
v1
Mean (SD) 0.820 (0.310) 0.637 (0.125) 0.809 (0.305)
Median [Min, Max] 0.810 [0.0900, 12.2] 0.620 [0.430, 0.970] 0.800 [0.0900, 12.2]
Missing 51 (2.0%) 16 (8.5%) 67 (2.4%)
v2
Mean (SD) 0.744 (1.89) 0.520 (0.129) 0.730 (1.83)
Median [Min, Max] 0.700 [0.220, 96.0] 0.500 [0.170, 1.01] 0.690 [0.170, 96.0]
Missing 50 (1.9%) 15 (7.9%) 65 (2.3%)
v3
Mean (SD) 0.874 (0.150) 0.692 (0.123) 0.862 (0.155)
Median [Min, Max] 0.866 [0.378, 1.51] 0.683 [0.279, 1.16] 0.855 [0.279, 1.51]
Missing 50 (1.9%) 15 (7.9%) 65 (2.3%)
v4
Mean (SD) 1.17 (0.226) 0.996 (0.218) 1.16 (0.230)
Median [Min, Max] 1.15 [0.389, 2.38] 0.976 [0.358, 1.83] 1.14 [0.358, 2.38]
Missing 56 (2.2%) 11 (5.8%) 67 (2.4%)
wt
Mean (SD) 74.1 (15.3) 62.4 (13.3) 73.3 (15.4)
Median [Min, Max] 73.0 [36.0, 149] 60.5 [35.0, 101] 72.0 [35.0, 149]
Missing 19 (0.7%) 13 (6.9%) 32 (1.1%)
ht
Mean (SD) 166 (9.23) 161 (9.72) 165 (9.34)
Median [Min, Max] 165 [143, 192] 160 [136, 190] 165 [136, 192]
Missing 21 (0.8%) 13 (6.9%) 34 (1.2%)
bmi
Mean (SD) 26.9 (4.73) 23.9 (3.86) 26.7 (4.74)
Median [Min, Max] 26.0 [15.0, 57.0] 24.0 [15.0, 36.0] 26.0 [15.0, 57.0]
Missing 21 (0.8%) 13 (6.9%) 34 (1.2%)
# Hiển thị dữ liệu với GGally
dat = os[, c("gender", "age", "v1", "v2", "v3", "v4", "wt", "ht", "bmi", "hipfx")]
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggpairs(dat, mapping = aes(color = hipfx))
## Warning: Removed 67 rows containing non-finite values (stat_boxplot).
## Warning: Removed 65 rows containing non-finite values (stat_boxplot).

## Warning: Removed 65 rows containing non-finite values (stat_boxplot).
## Warning: Removed 67 rows containing non-finite values (stat_boxplot).
## Warning: Removed 32 rows containing non-finite values (stat_boxplot).
## Warning: Removed 34 rows containing non-finite values (stat_boxplot).

## Warning: Removed 34 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 67 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 65 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 65 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 67 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 32 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 34 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 34 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 67 rows containing non-finite values (stat_bin).
## Warning: Removed 67 rows containing missing values (geom_point).
## Warning: Removed 67 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 67 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 67 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 99 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 73 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 74 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 74 rows containing missing values
## Warning: Removed 67 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 65 rows containing non-finite values (stat_bin).
## Warning: Removed 65 rows containing missing values (geom_point).
## Warning: Removed 67 rows containing missing values (geom_point).
## Warning: Removed 65 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 65 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 97 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 71 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 72 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 72 rows containing missing values
## Warning: Removed 65 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 65 rows containing non-finite values (stat_bin).
## Warning: Removed 65 rows containing missing values (geom_point).
## Warning: Removed 67 rows containing missing values (geom_point).
## Warning: Removed 65 rows containing missing values (geom_point).
## Warning: Removed 65 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 97 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 71 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 72 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 72 rows containing missing values
## Warning: Removed 65 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 67 rows containing non-finite values (stat_bin).
## Warning: Removed 67 rows containing missing values (geom_point).
## Warning: Removed 99 rows containing missing values (geom_point).
## Warning: Removed 97 rows containing missing values (geom_point).

## Warning: Removed 97 rows containing missing values (geom_point).
## Warning: Removed 67 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 73 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 74 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 74 rows containing missing values
## Warning: Removed 67 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 32 rows containing non-finite values (stat_bin).
## Warning: Removed 32 rows containing missing values (geom_point).
## Warning: Removed 73 rows containing missing values (geom_point).
## Warning: Removed 71 rows containing missing values (geom_point).

## Warning: Removed 71 rows containing missing values (geom_point).
## Warning: Removed 73 rows containing missing values (geom_point).
## Warning: Removed 32 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 34 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 34 rows containing missing values
## Warning: Removed 32 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 34 rows containing non-finite values (stat_bin).
## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 74 rows containing missing values (geom_point).
## Warning: Removed 72 rows containing missing values (geom_point).

## Warning: Removed 72 rows containing missing values (geom_point).
## Warning: Removed 74 rows containing missing values (geom_point).
## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 34 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 34 rows containing missing values
## Warning: Removed 34 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 34 rows containing non-finite values (stat_bin).
## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 74 rows containing missing values (geom_point).
## Warning: Removed 72 rows containing missing values (geom_point).

## Warning: Removed 72 rows containing missing values (geom_point).
## Warning: Removed 74 rows containing missing values (geom_point).
## Warning: Removed 34 rows containing missing values (geom_point).

## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 34 rows containing non-finite values (stat_density).
## Warning: Removed 34 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 67 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 65 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 65 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 67 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 32 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 34 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 34 rows containing non-finite values (stat_bin).

Task 2: Mô hình hồi quy tuyến tính đơn giản

# Khác biệt về odds gãy xương giữa nam và nữ?
m = glm(hipfx ~ gender, family=binomial, data=os)
summary(m)
## 
## Call:
## glm(formula = hipfx ~ gender, family = binomial, data = os)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4237  -0.4237  -0.4237  -0.2910   2.5232  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.36536    0.08777 -26.949  < 2e-16 ***
## genderMale  -0.77567    0.17289  -4.486 7.24e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1382.2  on 2787  degrees of freedom
## Residual deviance: 1360.0  on 2786  degrees of freedom
## AIC: 1364
## 
## Number of Fisher Scoring iterations: 5
library(epiDisplay)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
## 
## Attaching package: 'epiDisplay'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
logistic.display(m)
## 
## Logistic regression predicting hipfx : 1 vs 0 
##  
##                        OR(95%CI)         P(Wald's test) P(LR-test)
## gender: Male vs Female 0.46 (0.33,0.65)  < 0.001        < 0.001   
##                                                                   
## Log-likelihood = -679.98
## No. of observations = 2788
## AIC value = 1363.96
# Mối liên quan giữa v3 và hipfx
m = glm(hipfx ~ gender + v3, family=binomial, data=os)
logistic.display(m)
## 
## Logistic regression predicting hipfx : 1 vs 0 
##  
##                        crude OR(95%CI)  adj. OR(95%CI)    P(Wald's test)
## gender: Male vs Female 0.49 (0.35,0.7)  1.31 (0.89,1.94)  0.169         
##                                                                         
## v3 (cont. var.)        0 (0,0)          0 (0,0)           < 0.001       
##                                                                         
##                        P(LR-test)
## gender: Male vs Female 0.173     
##                                  
## v3 (cont. var.)        < 0.001   
##                                  
## Log-likelihood = -517.5989
## No. of observations = 2723
## AIC value = 1041.1979
# Tại sao OR = 0? Vì OR quá nhỏ nên làm tròn bằng 0 (thực ra exp(-9.8765) = 0.00005136775).
# Tính OR cho mỗi đơn vị thay đổi của standard deviation (sd)
sd(na.omit(os$v3))
## [1] 0.1546085
os$zv3 = os$v3 / sd(na.omit(os$v3))
m = glm(hipfx ~ gender + zv3, family=binomial, data=os)
logistic.display(m)
## 
## Logistic regression predicting hipfx : 1 vs 0 
##  
##                        crude OR(95%CI)   adj. OR(95%CI)    P(Wald's test)
## gender: Male vs Female 0.49 (0.35,0.7)   1.31 (0.89,1.94)  0.169         
##                                                                          
## zv3 (cont. var.)       0.23 (0.18,0.28)  0.22 (0.17,0.27)  < 0.001       
##                                                                          
##                        P(LR-test)
## gender: Male vs Female 0.173     
##                                  
## zv3 (cont. var.)       < 0.001   
##                                  
## Log-likelihood = -517.5989
## No. of observations = 2723
## AIC value = 1041.1979

Odds gãy xương ở nam giới bằng 0.46 (KTC 95% 0.33 - 0.65) odds gãy xương ở nữ giới. So với nữ giới, odds gãy xương ở nam giới thấp hơn 54% (KTC 95% 67% - 35%).

Task 3: Chọn biến liên quan dùng BMA

# Định nghĩa biến x và y
library(BMA)
## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.4-7)
xvars = os[, c("gender", "age", "wt", "ht", "bmi", "v1", "v2", "v3", "v4")]
yvar = os$hipfx

# Tìm biến liên quan:
m = bic.glm(xvars, yvar, strict=F, OR=20, glm.family="binomial")
## Warning in bic.glm.data.frame(xvars, yvar, strict = F, OR = 20, glm.family
## = "binomial"): There were 106 records deleted due to NA's
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m)
## 
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = "binomial",     strict = F, OR = 20)
## 
## 
##   5  models were selected
##  Best  5  models (cumulative posterior probability =  1 ): 
## 
##                p!=0    EV         SD        model 1     model 2   
## Intercept      100    -0.2193903  1.369781  -1.060e-01  -2.015e-01
## gender.x         0.0                                              
##         .Male          0.0000000  0.000000       .           .    
## age.x          100.0   0.0555837  0.011979   5.535e-02   5.718e-02
## wt.x             0.0   0.0000000  0.000000       .           .    
## ht.x             4.8   0.0008384  0.004426       .           .    
## bmi.x            6.4  -0.0025378  0.011323       .           .    
## v1.x            11.1  -0.2275109  0.728763       .      -2.059e+00
## v2.x             6.5  -0.1607453  0.711598       .           .    
## v3.x           100.0  -7.9911353  1.178497  -8.357e+00  -6.505e+00
## v4.x             0.0   0.0000000  0.000000       .           .    
##                                                                   
## nVar                                           2           3      
## BIC                                         -2.015e+04  -2.015e+04
## post prob                                    0.712       0.111    
##                model 3     model 4     model 5   
## Intercept      -2.962e-01   4.742e-01  -2.750e+00
## gender.x                                         
##         .Male       .           .           .    
## age.x           5.399e-02   5.519e-02   5.805e-02
## wt.x                .           .           .    
## ht.x                .           .       1.737e-02
## bmi.x               .      -3.964e-02       .    
## v1.x                .           .           .    
## v2.x           -2.482e+00       .           .    
## v3.x           -6.050e+00  -7.794e+00  -8.860e+00
## v4.x                .           .           .    
##                                                  
## nVar              3           3           3      
## BIC            -2.015e+04  -2.015e+04  -2.015e+04
## post prob       0.065       0.064       0.048
# Biểu đồ trình bày các mô hình có khả năng khi tiên lượng hipfx
imageplot.bma(m)

# Đánh giá tầm quan trọng
m = glm(hipfx ~ age + v3, family=binomial, data=os)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:epiDisplay':
## 
##     dotplot
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
varImp(m, scale=F)
##       Overall
## age  4.593955
## v3  11.265944