DAY 6- BAI TAP-15052025

BÀI TẬP DAY 6-15/05/2025 Ngày 6: Hồi qui logistic

##Việc 1. Đọc dữ liệu ‘Bone data’:file.choose() “D:\R\DU LIEU THUC HANH TS ThACH GUI\Bone data.csv”

os = read.csv("D:\\R\\DU LIEU THUC HANH TS ThACH GUI\\Bone data.csv")
head(os)

##   id    sex age weight height prior.fx fnbmd smoking fx
## 1  1   Male  73     98    175        0  1.08       1  0
## 2  2 Female  68     72    166        0  0.97       0  0
## 3  3   Male  68     87    184        0  1.01       0  0
## 4  4 Female  62     72    173        0  0.84       1  0
## 5  5   Male  61     72    173        0  0.81       1  0
## 6  6 Female  76     57    156        0  0.74       0  0

##Việc 2. Đánh giá mối liên quan giữa hút thuốc và nguy cơ gãy xương

m.1 = glm(fx ~ smoking, family = binomial, data = os)
summary(m.1)

## 
## Call:
## glm(formula = fx ~ smoking, family = binomial, data = os)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.03006    0.06442 -15.990   <2e-16 ***
## smoking     -0.13796    0.10081  -1.368    0.171    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2441.4  on 2161  degrees of freedom
## Residual deviance: 2439.5  on 2160  degrees of freedom
## AIC: 2443.5
## 
## Number of Fisher Scoring iterations: 4

library(epiDisplay)

## Loading required package: foreign

## Loading required package: survival

## Loading required package: MASS

## Loading required package: nnet

logistic.display(m.1)

## 
## Logistic regression predicting fx 
##  
##                  OR(95%CI)         P(Wald's test) P(LR-test)
## smoking: 1 vs 0  0.87 (0.71,1.06)  0.171          0.17      
##                                                             
## Log-likelihood = -1219.7469
## No. of observations = 2162
## AIC value = 2443.4938

###GPT

logistic_model <- glm(fx ~ smoking, data = os, family = binomial)
summary(logistic_model)

## 
## Call:
## glm(formula = fx ~ smoking, family = binomial, data = os)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.03006    0.06442 -15.990   <2e-16 ***
## smoking     -0.13796    0.10081  -1.368    0.171    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2441.4  on 2161  degrees of freedom
## Residual deviance: 2439.5  on 2160  degrees of freedom
## AIC: 2443.5
## 
## Number of Fisher Scoring iterations: 4

##Việc 3. Đánh giá mối liên quan độc lập giữa hút thuốc và nguy cơ gãy xương. Giới tính là yếu tố gây nhiễu

library(compareGroups)
createTable(compareGroups(sex ~ smoking + fx, data = os))

## 
## --------Summary descriptives table by 'sex'---------
## 
## _________________________________________ 
##           Female       Male     p.overall 
##           N=1317       N=845              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## smoking 0.30 (0.46) 0.62 (0.49)  <0.001   
## fx      0.30 (0.46) 0.17 (0.38)  <0.001   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

###[ChatGPT] #PROMPT: “Y văn ghi nhận giới tính (sex) là yếu tố gây nhiễu. Bạn thực hiện mô hinh hồi qui logistic đa biến đánh giá mối liên quan độc lập giữa hút thuốc và nguy cơ gãy xương”

logistic_model_multivariate <- glm(fx ~ smoking + sex, data = os, family = binomial)
summary(logistic_model_multivariate)

## 
## Call:
## glm(formula = fx ~ smoking + sex, family = binomial, data = os)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.85598    0.06835 -12.523  < 2e-16 ***
## smoking      0.09872    0.10724   0.921    0.357    
## sexMale     -0.78880    0.11493  -6.863 6.73e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2441.4  on 2161  degrees of freedom
## Residual deviance: 2389.6  on 2159  degrees of freedom
## AIC: 2395.6
## 
## Number of Fisher Scoring iterations: 4

##Việc 4. Đánh giá tầm quan trọng của biến số

library(caret)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:epiDisplay':
## 
##     alpha

## Loading required package: lattice

## 
## Attaching package: 'lattice'

## The following object is masked from 'package:epiDisplay':
## 
##     dotplot

## 
## Attaching package: 'caret'

## The following object is masked from 'package:survival':
## 
##     cluster

m.rel <- glm(fx ~ sex + age + fnbmd + weight, family = binomial, data = os)
varImp(m.rel, scale = FALSE)

##           Overall
## sexMale 2.3022178
## age     2.5112611
## fnbmd   8.7193666
## weight  0.8239425

###GPT- Làm tiếp sau ##Việc 5. Xây dựng mô hình dự báo nguy cơ gãy xương ###5.1 Tìm mô hình tối ưu bằng pp BMA

library(BMA)

## Loading required package: leaps

## Loading required package: robustbase

## 
## Attaching package: 'robustbase'

## The following object is masked from 'package:survival':
## 
##     heart

## Loading required package: inline

## Loading required package: rrcov

## Scalable Robust Estimators with High Breakdown Point (version 1.7-7)

os.2 = na.omit(os)
xvars = os.2[, c("sex", "age", "weight", "height", "fnbmd", "smoking", "prior.fx")]
yvar = os.2[, c("fx")]
m.bma = bic.glm(xvars, yvar, strict = FALSE, OR = 20, glm.family = "binomial")
summary(m.bma)

## 
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = "binomial",     strict = FALSE, OR = 20)
## 
## 
##   4  models were selected
##  Best  4  models (cumulative posterior probability =  1 ): 
## 
##            p!=0    EV        SD        model 1     model 2     model 3   
## Intercept  100     2.249962  0.674258   2.509e+00   2.340e+00   1.140e+00
## sex         18.2  -0.048048  0.114551       .      -2.578e-01       .    
## age         16.2   0.002663  0.006838       .           .       1.602e-02
## weight       0.0   0.000000  0.000000       .           .           .    
## height       0.0   0.000000  0.000000       .           .           .    
## fnbmd      100.0  -4.487636  0.436674  -4.595e+00  -4.279e+00  -4.302e+00
## smoking      0.0   0.000000  0.000000       .           .           .    
## prior.fx   100.0   0.530729  0.134289   5.306e-01   5.445e-01   5.156e-01
##                                                                          
## nVar                                      2           3           3      
## BIC                                    -1.402e+04  -1.402e+04  -1.402e+04
## post prob                               0.696       0.142       0.122    
##            model 4   
## Intercept   7.978e-01
## sex        -2.851e-01
## age         1.780e-02
## weight          .    
## height          .    
## fnbmd      -3.918e+00
## smoking         .    
## prior.fx    5.296e-01
##                      
## nVar          4      
## BIC        -1.402e+04
## post prob   0.040

Chart

imageplot.bma(m.bma)

###5.2 Mô hình tối ưu

imageplot.bma(m.bma)

os.2$fnbmd.sd = os.2$fnbmd/0.15
m.3 = glm(fx ~ fnbmd.sd, family = binomial, data = os.2)
logistic.display(m.3)

## 
## Logistic regression predicting fx 
##  
##                       OR(95%CI)         P(Wald's test) P(LR-test)
## fnbmd.sd (cont. var.) 0.49 (0.43,0.54)  < 0.001        < 0.001   
##                                                                  
## Log-likelihood = -1106.8757
## No. of observations = 2121
## AIC value = 2217.7514

####GPT:PROMPT: “Xây dựng mô hình tối ưu dự báo gãy xương từ những biến số như tuổi (age), giới tính (sex), cân nặng (weight), chiều cao (height), mật độ xương (fnbmd), hút thuốc là (smoking) và tiền căn gãy xương (prior.fx). Dùng phương pháp Bayesian Model Averaging từ gói lệnh BMA”

logistic_bma_model <- bic.glm(fx ~ age + sex + weight + height + fnbmd + smoking + prior.fx,data = os, glm.family = "binomial")
summary(logistic_bma_model)

## 
## Call:
## bic.glm.formula(f = fx ~ age + sex + weight + height + fnbmd +     smoking + prior.fx, data = os, glm.family = "binomial")
## 
## 
##   4  models were selected
##  Best  4  models (cumulative posterior probability =  1 ): 
## 
##            p!=0    EV        SD        model 1     model 2     model 3   
## Intercept  100     2.249962  0.674258   2.509e+00   2.340e+00   1.140e+00
## age         16.2   0.002663  0.006838       .           .       1.602e-02
## sexMale     18.2  -0.048048  0.114551       .      -2.578e-01       .    
## weight       0.0   0.000000  0.000000       .           .           .    
## height       0.0   0.000000  0.000000       .           .           .    
## fnbmd      100.0  -4.487636  0.436674  -4.595e+00  -4.279e+00  -4.302e+00
## smoking      0.0   0.000000  0.000000       .           .           .    
## prior.fx   100.0   0.530729  0.134289   5.306e-01   5.445e-01   5.156e-01
##                                                                          
## nVar                                      2           3           3      
## BIC                                    -1.402e+04  -1.402e+04  -1.402e+04
## post prob                               0.696       0.142       0.122    
##            model 4   
## Intercept   7.978e-01
## age         1.780e-02
## sexMale    -2.851e-01
## weight          .    
## height          .    
## fnbmd      -3.918e+00
## smoking         .    
## prior.fx    5.296e-01
##                      
## nVar          4      
## BIC        -1.402e+04
## post prob   0.040    
## 
##   41  observations deleted due to missingness.

##Việc 6. Ghi lại tất cả các hàm/lệnh trên và chia sẻ lên tài khoản rpubs https://rpubs.com/PHUC/1310628

DAY 6- BAI TAP-15052025

NGUYEN VAN PHUC

2025-05-15

BÀI TẬP DAY 6-15/05/2025 Ngày 6: Hồi qui logistic

Chart