Package sẽ sử dụng: epiDisplay, BMA, GGally Dữ liệu thực hành: “Hip fracture data.csv” (loãng xương) và “wdbc.csv” (ung thư vú)
os = read.csv("C:\\Users\\Nguyen\\Desktop\\Workshop NCKH 2019\\Dataset\\Hip fracture data.csv")
str(os$hipfx)
## int [1:2788] 0 0 0 0 0 0 0 0 0 0 ...
os$hipfx = as.factor(os$hipfx)
str(os$hipfx)
## Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
head(os)
## id dov gender age dob visit v1 v2 v3 v4 wt bmi ht v5
## 1 3 15/6/89 Male 73 8/6/16 1 0.98 0.88 1.079 1.458 98 32 175 NA
## 2 8 17/4/89 Female 67 11/12/21 1 0.85 0.85 0.966 1.325 72 26 166 18
## 3 9 12/6/90 Male 68 8/1/22 1 0.87 0.84 1.013 1.494 87 26 184 36
## 4 10 4/6/90 Female 62 15/5/28 1 0.62 0.71 0.839 1.214 72 24 173 NA
## 5 23 8/8/89 Male 61 22/9/28 1 0.87 0.60 0.811 1.144 72 24 173 44
## 6 24 3/5/89 Female 76 1/8/13 1 0.76 0.58 0.743 0.980 67 28 156 15
## v6 v7 v8 v9 hipfx timehip
## 1 39.9 1 0 0 0 0.55
## 2 31.0 0 0 0 0 19.68
## 3 28.6 0 0 0 0 5.05
## 4 28.2 1 0 0 0 18.55
## 5 28.9 1 0 0 0 19.37
## 6 33.3 0 0 0 0 12.30
# Phân tích mô tả các biến gender, age, v1, v2, v3, v4, wt, ht, bmi theo hipfx
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
table1(~gender + age + v1 + v2 + v3 + v4 + wt + ht + bmi | hipfx, data=os)
| 0 (n=2599) |
1 (n=189) |
Overall (n=2788) |
|
|---|---|---|---|
| gender | |||
| Female | 1512 (58.2%) | 142 (75.1%) | 1654 (59.3%) |
| Male | 1087 (41.8%) | 47 (24.9%) | 1134 (40.7%) |
| age | |||
| Mean (SD) | 68.9 (6.35) | 75.3 (7.74) | 69.3 (6.65) |
| Median [Min, Max] | 67.0 [60.0, 94.0] | 75.0 [60.0, 96.0] | 68.0 [60.0, 96.0] |
| v1 | |||
| Mean (SD) | 0.820 (0.310) | 0.637 (0.125) | 0.809 (0.305) |
| Median [Min, Max] | 0.810 [0.0900, 12.2] | 0.620 [0.430, 0.970] | 0.800 [0.0900, 12.2] |
| Missing | 51 (2.0%) | 16 (8.5%) | 67 (2.4%) |
| v2 | |||
| Mean (SD) | 0.744 (1.89) | 0.520 (0.129) | 0.730 (1.83) |
| Median [Min, Max] | 0.700 [0.220, 96.0] | 0.500 [0.170, 1.01] | 0.690 [0.170, 96.0] |
| Missing | 50 (1.9%) | 15 (7.9%) | 65 (2.3%) |
| v3 | |||
| Mean (SD) | 0.874 (0.150) | 0.692 (0.123) | 0.862 (0.155) |
| Median [Min, Max] | 0.866 [0.378, 1.51] | 0.683 [0.279, 1.16] | 0.855 [0.279, 1.51] |
| Missing | 50 (1.9%) | 15 (7.9%) | 65 (2.3%) |
| v4 | |||
| Mean (SD) | 1.17 (0.226) | 0.996 (0.218) | 1.16 (0.230) |
| Median [Min, Max] | 1.15 [0.389, 2.38] | 0.976 [0.358, 1.83] | 1.14 [0.358, 2.38] |
| Missing | 56 (2.2%) | 11 (5.8%) | 67 (2.4%) |
| wt | |||
| Mean (SD) | 74.1 (15.3) | 62.4 (13.3) | 73.3 (15.4) |
| Median [Min, Max] | 73.0 [36.0, 149] | 60.5 [35.0, 101] | 72.0 [35.0, 149] |
| Missing | 19 (0.7%) | 13 (6.9%) | 32 (1.1%) |
| ht | |||
| Mean (SD) | 166 (9.23) | 161 (9.72) | 165 (9.34) |
| Median [Min, Max] | 165 [143, 192] | 160 [136, 190] | 165 [136, 192] |
| Missing | 21 (0.8%) | 13 (6.9%) | 34 (1.2%) |
| bmi | |||
| Mean (SD) | 26.9 (4.73) | 23.9 (3.86) | 26.7 (4.74) |
| Median [Min, Max] | 26.0 [15.0, 57.0] | 24.0 [15.0, 36.0] | 26.0 [15.0, 57.0] |
| Missing | 21 (0.8%) | 13 (6.9%) | 34 (1.2%) |
# Hiển thị dữ liệu với GGally
dat = os[, c("gender", "age", "v1", "v2", "v3", "v4", "wt", "ht", "bmi", "hipfx")]
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(dat, mapping = aes(color = hipfx))
## Warning: Removed 67 rows containing non-finite values (stat_boxplot).
## Warning: Removed 65 rows containing non-finite values (stat_boxplot).
## Warning: Removed 65 rows containing non-finite values (stat_boxplot).
## Warning: Removed 67 rows containing non-finite values (stat_boxplot).
## Warning: Removed 32 rows containing non-finite values (stat_boxplot).
## Warning: Removed 34 rows containing non-finite values (stat_boxplot).
## Warning: Removed 34 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 67 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 65 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 65 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 67 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 32 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 34 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 34 rows containing missing values
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 67 rows containing non-finite values (stat_bin).
## Warning: Removed 67 rows containing missing values (geom_point).
## Warning: Removed 67 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 67 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 67 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 99 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 73 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 74 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 74 rows containing missing values
## Warning: Removed 67 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 65 rows containing non-finite values (stat_bin).
## Warning: Removed 65 rows containing missing values (geom_point).
## Warning: Removed 67 rows containing missing values (geom_point).
## Warning: Removed 65 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 65 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 97 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 71 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 72 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 72 rows containing missing values
## Warning: Removed 65 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 65 rows containing non-finite values (stat_bin).
## Warning: Removed 65 rows containing missing values (geom_point).
## Warning: Removed 67 rows containing missing values (geom_point).
## Warning: Removed 65 rows containing missing values (geom_point).
## Warning: Removed 65 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 97 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 71 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 72 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 72 rows containing missing values
## Warning: Removed 65 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 67 rows containing non-finite values (stat_bin).
## Warning: Removed 67 rows containing missing values (geom_point).
## Warning: Removed 99 rows containing missing values (geom_point).
## Warning: Removed 97 rows containing missing values (geom_point).
## Warning: Removed 97 rows containing missing values (geom_point).
## Warning: Removed 67 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 73 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 74 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 74 rows containing missing values
## Warning: Removed 67 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 32 rows containing non-finite values (stat_bin).
## Warning: Removed 32 rows containing missing values (geom_point).
## Warning: Removed 73 rows containing missing values (geom_point).
## Warning: Removed 71 rows containing missing values (geom_point).
## Warning: Removed 71 rows containing missing values (geom_point).
## Warning: Removed 73 rows containing missing values (geom_point).
## Warning: Removed 32 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 34 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 34 rows containing missing values
## Warning: Removed 32 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 34 rows containing non-finite values (stat_bin).
## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 74 rows containing missing values (geom_point).
## Warning: Removed 72 rows containing missing values (geom_point).
## Warning: Removed 72 rows containing missing values (geom_point).
## Warning: Removed 74 rows containing missing values (geom_point).
## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 34 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 34 rows containing missing values
## Warning: Removed 34 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 34 rows containing non-finite values (stat_bin).
## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 74 rows containing missing values (geom_point).
## Warning: Removed 72 rows containing missing values (geom_point).
## Warning: Removed 72 rows containing missing values (geom_point).
## Warning: Removed 74 rows containing missing values (geom_point).
## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 34 rows containing missing values (geom_point).
## Warning: Removed 34 rows containing non-finite values (stat_density).
## Warning: Removed 34 rows containing non-finite values (stat_boxplot).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 67 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 65 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 65 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 67 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 32 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 34 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 34 rows containing non-finite values (stat_bin).
# Khác biệt về odds gãy xương giữa nam và nữ?
m = glm(hipfx ~ gender, family=binomial, data=os)
summary(m)
##
## Call:
## glm(formula = hipfx ~ gender, family = binomial, data = os)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4237 -0.4237 -0.4237 -0.2910 2.5232
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.36536 0.08777 -26.949 < 2e-16 ***
## genderMale -0.77567 0.17289 -4.486 7.24e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1382.2 on 2787 degrees of freedom
## Residual deviance: 1360.0 on 2786 degrees of freedom
## AIC: 1364
##
## Number of Fisher Scoring iterations: 5
library(epiDisplay)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
##
## Attaching package: 'epiDisplay'
## The following object is masked from 'package:ggplot2':
##
## alpha
logistic.display(m)
##
## Logistic regression predicting hipfx : 1 vs 0
##
## OR(95%CI) P(Wald's test) P(LR-test)
## gender: Male vs Female 0.46 (0.33,0.65) < 0.001 < 0.001
##
## Log-likelihood = -679.98
## No. of observations = 2788
## AIC value = 1363.96
# Mối liên quan giữa v3 và hipfx
m = glm(hipfx ~ gender + v3, family=binomial, data=os)
logistic.display(m)
##
## Logistic regression predicting hipfx : 1 vs 0
##
## crude OR(95%CI) adj. OR(95%CI) P(Wald's test)
## gender: Male vs Female 0.49 (0.35,0.7) 1.31 (0.89,1.94) 0.169
##
## v3 (cont. var.) 0 (0,0) 0 (0,0) < 0.001
##
## P(LR-test)
## gender: Male vs Female 0.173
##
## v3 (cont. var.) < 0.001
##
## Log-likelihood = -517.5989
## No. of observations = 2723
## AIC value = 1041.1979
# Tại sao OR = 0? Vì OR quá nhỏ nên làm tròn bằng 0 (thực ra exp(-9.8765) = 0.00005136775).
# Tính OR cho mỗi đơn vị thay đổi của standard deviation (sd)
sd(na.omit(os$v3))
## [1] 0.1546085
os$zv3 = os$v3 / sd(na.omit(os$v3))
m = glm(hipfx ~ gender + zv3, family=binomial, data=os)
logistic.display(m)
##
## Logistic regression predicting hipfx : 1 vs 0
##
## crude OR(95%CI) adj. OR(95%CI) P(Wald's test)
## gender: Male vs Female 0.49 (0.35,0.7) 1.31 (0.89,1.94) 0.169
##
## zv3 (cont. var.) 0.23 (0.18,0.28) 0.22 (0.17,0.27) < 0.001
##
## P(LR-test)
## gender: Male vs Female 0.173
##
## zv3 (cont. var.) < 0.001
##
## Log-likelihood = -517.5989
## No. of observations = 2723
## AIC value = 1041.1979
Odds gãy xương ở nam giới bằng 0.46 (KTC 95% 0.33 - 0.65) odds gãy xương ở nữ giới. So với nữ giới, odds gãy xương ở nam giới thấp hơn 54% (KTC 95% 67% - 35%).
# Định nghĩa biến x và y
library(BMA)
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.4-7)
xvars = os[, c("gender", "age", "wt", "ht", "bmi", "v1", "v2", "v3", "v4")]
yvar = os$hipfx
# Tìm biến liên quan:
m = bic.glm(xvars, yvar, strict=F, OR=20, glm.family="binomial")
## Warning in bic.glm.data.frame(xvars, yvar, strict = F, OR = 20, glm.family
## = "binomial"): There were 106 records deleted due to NA's
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m)
##
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = "binomial", strict = F, OR = 20)
##
##
## 5 models were selected
## Best 5 models (cumulative posterior probability = 1 ):
##
## p!=0 EV SD model 1 model 2
## Intercept 100 -0.2193903 1.369781 -1.060e-01 -2.015e-01
## gender.x 0.0
## .Male 0.0000000 0.000000 . .
## age.x 100.0 0.0555837 0.011979 5.535e-02 5.718e-02
## wt.x 0.0 0.0000000 0.000000 . .
## ht.x 4.8 0.0008384 0.004426 . .
## bmi.x 6.4 -0.0025378 0.011323 . .
## v1.x 11.1 -0.2275109 0.728763 . -2.059e+00
## v2.x 6.5 -0.1607453 0.711598 . .
## v3.x 100.0 -7.9911353 1.178497 -8.357e+00 -6.505e+00
## v4.x 0.0 0.0000000 0.000000 . .
##
## nVar 2 3
## BIC -2.015e+04 -2.015e+04
## post prob 0.712 0.111
## model 3 model 4 model 5
## Intercept -2.962e-01 4.742e-01 -2.750e+00
## gender.x
## .Male . . .
## age.x 5.399e-02 5.519e-02 5.805e-02
## wt.x . . .
## ht.x . . 1.737e-02
## bmi.x . -3.964e-02 .
## v1.x . . .
## v2.x -2.482e+00 . .
## v3.x -6.050e+00 -7.794e+00 -8.860e+00
## v4.x . . .
##
## nVar 3 3 3
## BIC -2.015e+04 -2.015e+04 -2.015e+04
## post prob 0.065 0.064 0.048
# Biểu đồ trình bày các mô hình có khả năng khi tiên lượng hipfx
imageplot.bma(m)
# Đánh giá tầm quan trọng
m = glm(hipfx ~ age + v3, family=binomial, data=os)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:epiDisplay':
##
## dotplot
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
varImp(m, scale=F)
## Overall
## age 4.593955
## v3 11.265944