100126 - PTDL với R

#Ngày 4: Hồi quy Logistic ## Việc 1. Đọc dữ liệu

df = read.csv("/Users/dr.ndnchuong/Desktop/Helix-Innovations/CME - Đào tạo Y khoa liên tục/2026 - CME/[01:2026] Học ngắn hạn - Sử dụng AI trong phân tích dữ liệu - GS Nguyễn Văn Tuấn/DỮ LIỆU THỰC HÀNH (Gửi học viên)/Bone data.csv")

###Việc 2. Mối liên quan giữa hút thuốc lá và gãy xương ###2.1. Mô hình hồi quy logistic #### Gói lệnh cơ bản

library(lessR)

## Warning: package 'lessR' was built under R version 4.5.2

## 
## lessR 4.5                            feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is the default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation to pivot tables.
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including modern time series forecasting
##   and many, new Plotly interactive visualizations output. Most
##   visualization functions are now reorganized to three functions:
##      Chart(): type="bar", "pie", "radar", "bubble", "treemap", "icicle"
##      X(): type="histogram", "density", "vbs" and more
##      XY(): type="scatter" for a scatterplot, or "contour", "smooth"
##    Most previous function calls still work, such as:
##      BarChart(), Histogram, and Plot().
##   Enter: news(package="lessR"), or ?Chart, ?X, or ?XY
## There is also Flows() for Sankey flow diagrams, see ?Flows
## 
## Interactive data analysis for constructing visualizations.
##   Enter: interact()

model <- glm(fx ~ smoking, data = df, family = binomial)
summary(model)

## 
## Call:
## glm(formula = fx ~ smoking, family = binomial, data = df)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.03006    0.06442 -15.990   <2e-16 ***
## smoking     -0.13796    0.10081  -1.368    0.171    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2441.4  on 2161  degrees of freedom
## Residual deviance: 2439.5  on 2160  degrees of freedom
## AIC: 2443.5
## 
## Number of Fisher Scoring iterations: 4

Gói lệnh lessR

library(lessR)
Logit(fx ~ smoking, data = df, brief = TRUE)

## 
## Response Variable:   fx
## Predictor Variable 1:  smoking
## 
## Number of cases (rows) of data:  2162 
## Number of cases retained for analysis:  2162 
## 
## 
##    BASIC ANALYSIS 
## 
## -- Estimated Model of fx for the Logit of Reference Group Membership
## 
##              Estimate    Std Err  z-value  p-value   Lower 95%   Upper 95%
## (Intercept)   -1.0301     0.0644  -15.990    0.000     -1.1563     -0.9038 
##     smoking   -0.1380     0.1008   -1.368    0.171     -0.3355      0.0596 
## 
## 
## -- Odds Ratios and Confidence Intervals
## 
##              Odds Ratio   Lower 95%   Upper 95%
## (Intercept)      0.3570      0.3146      0.4050 
##     smoking      0.8711      0.7149      1.0614 
## 
## 
## -- Model Fit
## 
##     Null deviance: 2441.375 on 2161 degrees of freedom
## Residual deviance: 2439.494 on 2160 degrees of freedom
## 
## AIC: 2443.494 
## 
## Number of iterations to convergence: 4

library(lessR)
model.lessR <- Logit(fx ~ smoking, data = df, brief = FALSE)

## 
## Response Variable:   fx
## Predictor Variable 1:  smoking
## 
## Number of cases (rows) of data:  2162 
## Number of cases retained for analysis:  2162 
## 
## 
##    BASIC ANALYSIS 
## 
## -- Estimated Model of fx for the Logit of Reference Group Membership
## 
##              Estimate    Std Err  z-value  p-value   Lower 95%   Upper 95%
## (Intercept)   -1.0301     0.0644  -15.990    0.000     -1.1563     -0.9038 
##     smoking   -0.1380     0.1008   -1.368    0.171     -0.3355      0.0596 
## 
## 
## -- Odds Ratios and Confidence Intervals
## 
##              Odds Ratio   Lower 95%   Upper 95%
## (Intercept)      0.3570      0.3146      0.4050 
##     smoking      0.8711      0.7149      1.0614 
## 
## 
## -- Model Fit
## 
##     Null deviance: 2441.375 on 2161 degrees of freedom
## Residual deviance: 2439.494 on 2160 degrees of freedom
## 
## AIC: 2443.494 
## 
## Number of iterations to convergence: 4 
## 
## 
##    ANALYSIS OF RESIDUALS AND INFLUENCE 
## Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
##    [sorted by Cook's Distance]
##    [res_rows = 20 out of 2162 cases (rows) of data]
## --------------------------------------------------------------------
##     smoking fx P(Y=1) residual rstudent  dffits    cooks
## 36        1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 39        1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 55        1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 56        1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 63        1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 65        1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 120       1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 129       1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 133       1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 135       1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 141       1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 146       1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 150       1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 163       1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 169       1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 173       1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 181       1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 182       1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 197       1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 203       1  1 0.2372   0.7628    1.697 0.05273 0.001753
## 
## 
##    PREDICTION 
## 
## Probability threshold for classification : 0.5
## 
## 
## Data, Fitted Values, Standard Errors
##    [sorted by fitted value]
##    [pred_all=TRUE to see all intervals displayed]
## --------------------------------------------------------------------
##   smoking fx label fitted std.err
## 1       1  0     0 0.2372 0.01403
## 4       1  0     0 0.2372 0.01403
## 5       1  0     0 0.2372 0.01403
## 7       1  0     0 0.2372 0.01403
## 
## ... for the rows of data where fitted is close to 0.5 ...
## 
##      smoking fx label fitted std.err
## 2148       1  0     0 0.2372 0.01403
## 2150       1  1     0 0.2372 0.01403
## 2          0  0     0 0.2631 0.01249
## 3          0  0     0 0.2631 0.01249
## 6          0  0     0 0.2631 0.01249
## 
## ... for the last 4 rows of sorted data ...
## 
##      smoking fx label fitted std.err
## 2159       0  0     0 0.2631 0.01249
## 2160       0  1     0 0.2631 0.01249
## 2161       0  0     0 0.2631 0.01249
## 2162       0  1     0 0.2631 0.01249
## --------------------------------------------------------------------
## 
## 
## ----------------------------
## Specified confusion matrices
## ----------------------------
## 
## Probability threshold for predicting : 0.5
## Corresponding cutoff threshold for smoking: -7.467
## 
##            Baseline         Predicted 
## ---------------------------------------------------
##           Total  %Tot        0      1  %Correct 
## ---------------------------------------------------
##      1      545  25.2      545      0     0.0 
## fx   0     1617  74.8     1617      0     100.0 
## ---------------------------------------------------
##    Total   2162                           74.8 
## 
## Accuracy: 74.79 
## Sensitivity: 0.00 
## Precision: NaN

###Việc 3. Đánh giá mối liên quan độc lập giữa hút thuốc và nguy cơ gãy xương ####3.1 Mô hình hồi qui logistic

model.2 <- glm(fx ~ smoking + sex, data = df, family = binomial)
summary(model.2)

## 
## Call:
## glm(formula = fx ~ smoking + sex, family = binomial, data = df)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.85598    0.06835 -12.523  < 2e-16 ***
## smoking      0.09872    0.10724   0.921    0.357    
## sexMale     -0.78880    0.11493  -6.863 6.73e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2441.4  on 2161  degrees of freedom
## Residual deviance: 2389.6  on 2159  degrees of freedom
## AIC: 2395.6
## 
## Number of Fisher Scoring iterations: 4

model2.lessR = Logit(fx ~ smoking + sex, data = df)

## 
## >>> Note:  sex is not a numeric variable.
##            Indicator variables are created and analyzed.
## 
## Response Variable:   fx
## Predictor Variable 1:  smoking
## Predictor Variable 2:  sexMale
## 
## Number of cases (rows) of data:  2162 
## Number of cases retained for analysis:  2162 
## 
## 
##    BASIC ANALYSIS 
## 
## -- Estimated Model of fx for the Logit of Reference Group Membership
## 
##              Estimate    Std Err  z-value  p-value   Lower 95%   Upper 95%
## (Intercept)   -0.8560     0.0684  -12.523    0.000     -0.9900     -0.7220 
##     smoking    0.0987     0.1072    0.921    0.357     -0.1115      0.3089 
##     sexMale   -0.7888     0.1149   -6.863    0.000     -1.0141     -0.5635 
## 
## 
## -- Odds Ratios and Confidence Intervals
## 
##              Odds Ratio   Lower 95%   Upper 95%
## (Intercept)      0.4249      0.3716      0.4858 
##     smoking      1.1038      0.8945      1.3619 
##     sexMale      0.4544      0.3627      0.5692 
## 
## 
## -- Model Fit
## 
##     Null deviance: 2441.375 on 2161 degrees of freedom
## Residual deviance: 2389.599 on 2159 degrees of freedom
## 
## AIC: 2395.599 
## 
## Number of iterations to convergence: 4 
## 
## 
## Collinearity
## 
##         Tolerance       VIF
## smoking     0.899     1.112
## sexMale     0.899     1.112
## 
##    ANALYSIS OF RESIDUALS AND INFLUENCE 
## Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
##    [sorted by Cook's Distance]
##    [res_rows = 20 out of 2162 cases (rows) of data]
## --------------------------------------------------------------------
##     smoking sexMale fx P(Y=1) residual rstudent  dffits    cooks
## 18        0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 183       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 212       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 217       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 221       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 263       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 308       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 323       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 491       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 545       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 561       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 563       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 628       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 680       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 744       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 762       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 770       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 812       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 821       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 951       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 
## 
##    PREDICTION 
## 
## Probability threshold for classification : 0.5
## 
## 
## Data, Fitted Values, Standard Errors
##    [sorted by fitted value]
##    [pred_all=TRUE to see all intervals displayed]
## --------------------------------------------------------------------
##    smoking sexMale fx label fitted std.err
## 3        0       1  0     0 0.1618 0.01548
## 14       0       1  0     0 0.1618 0.01548
## 18       0       1  1     0 0.1618 0.01548
## 24       0       1  0     0 0.1618 0.01548
## 
## ... for the rows of data where fitted is close to 0.5 ...
## 
##      smoking sexMale fx label fitted std.err
## 2160       0       0  1     0 0.2982 0.01430
## 2162       0       0  1     0 0.2982 0.01430
## 4          1       0  0     0 0.3192 0.02074
## 20         1       0  0     0 0.3192 0.02074
## 32         1       0  0     0 0.3192 0.02074
## 
## ... for the last 4 rows of sorted data ...
## 
##      smoking sexMale fx label fitted std.err
## 2146       1       0  1     0 0.3192 0.02074
## 2147       1       0  0     0 0.3192 0.02074
## 2148       1       0  0     0 0.3192 0.02074
## 2150       1       0  1     0 0.3192 0.02074
## --------------------------------------------------------------------
## 
## 
## ----------------------------
## Specified confusion matrices
## ----------------------------
## 
## Probability threshold for predicting : 0.5
## 
##            Baseline         Predicted 
## ---------------------------------------------------
##           Total  %Tot        0      1  %Correct 
## ---------------------------------------------------
##      1      545  25.2      545      0     0.0 
## fx   0     1617  74.8     1617      0     100.0 
## ---------------------------------------------------
##    Total   2162                           74.8 
## 
## Accuracy: 74.79 
## Sensitivity: 0.00 
## Precision: NaN

Giai thich: Trong cung dieu kien hut thuoc nhu nhau, odds gay xuong cua nam thap hon 55% so voi nu gioi, khoang tin cay 95% trong khoang 43% den 64% (P<.0001)

Tim mo hinh toi uu bang PP BMA

library(BMA)

## Loading required package: survival

## Loading required package: leaps

## Loading required package: robustbase

## 
## Attaching package: 'robustbase'

## The following object is masked from 'package:survival':
## 
##     heart

## Loading required package: inline

## Loading required package: rrcov

## Scalable Robust Estimators with High Breakdown Point (version 1.7-7)

df_bma <- na.omit(df[, c("fx", "age", "sex", "weight", "height", "fnbmd", "smoking", "prior.fx")])
dim (df_bma)

## [1] 2121    8

bma_model <- bic.glm(
  x = df_bma[, c("age", "sex", "weight", "height", "fnbmd", "smoking", "prior.fx")],
  y = df_bma$fx,
  glm.family = binomial()
)
summary(bma_model)

## 
## Call:
## bic.glm.data.frame(x = df_bma[, c("age", "sex", "weight", "height",     "fnbmd", "smoking", "prior.fx")], y = df_bma$fx, glm.family = binomial())
## 
## 
##   4  models were selected
##  Best  4  models (cumulative posterior probability =  1 ): 
## 
##            p!=0    EV        SD        model 1     model 2     model 3   
## Intercept  100     2.249962  0.674258   2.509e+00   2.340e+00   1.140e+00
## age         16.2   0.002663  0.006838       .           .       1.602e-02
## sex         18.2  -0.048048  0.114551       .      -2.578e-01       .    
## weight       0.0   0.000000  0.000000       .           .           .    
## height       0.0   0.000000  0.000000       .           .           .    
## fnbmd      100.0  -4.487636  0.436674  -4.595e+00  -4.279e+00  -4.302e+00
## smoking      0.0   0.000000  0.000000       .           .           .    
## prior.fx   100.0   0.530729  0.134289   5.306e-01   5.445e-01   5.156e-01
##                                                                          
## nVar                                      2           3           3      
## BIC                                    -1.402e+04  -1.402e+04  -1.402e+04
## post prob                               0.696       0.142       0.122    
##            model 4   
## Intercept   7.978e-01
## age         1.780e-02
## sex        -2.851e-01
## weight          .    
## height          .    
## fnbmd      -3.918e+00
## smoking         .    
## prior.fx    5.296e-01
##                      
## nVar          4      
## BIC        -1.402e+04
## post prob   0.040

df_bma$fnbmd.sd = df_bma$fnbmd/0.15
bma_best = glm(fx ~ fnbmd.sd + prior.fx, family = binomial, data = df_bma)
exp(cbind(Odds_Ratio = coef(bma_best), confint(bma_best)))

## Waiting for profiling to be done...

##             Odds_Ratio     2.5 %    97.5 %
## (Intercept) 12.2977330 6.6858383 22.888616
## fnbmd.sd     0.5019265 0.4467839  0.562099
## prior.fx     1.6999821 1.3051619  2.207473

###ROC

m1 = glm(fx ~fnbmd.sd + prior.fx, family = binomial, data = df_bma)
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

pred = predict(m1, type = "response")
roc_curve = roc(df_bma$fx ~ pred)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

auc(roc_curve)

## Area under the curve: 0.7043

ci(roc_curve)

## 95% CI: 0.6794-0.7292 (DeLong)

plot(roc_curve)

###Calibration

library(rms)

## Loading required package: Hmisc

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:lessR':
## 
##     label, Merge

## The following objects are masked from 'package:base':
## 
##     format.pval, units

fit = lrm(fx ~ fnbmd.sd + prior.fx, data = df_bma)
pred2 = predict(fit)
phat = 1/(1 + exp(-pred2))
val.prob(phat,df_bma$fx, m = 20, cex = 0.5)

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  4.085621e-01  7.042810e-01  1.312413e-01  9.255161e-02  1.973020e+02 
##           D:p             U      U:Chi-sq           U:p             Q 
##  0.000000e+00 -9.429514e-04 -4.547474e-13  1.000000e+00  9.349456e-02 
##         Brier     Intercept         Slope          Emax           E90 
##  1.716198e-01  3.121412e-15  1.000000e+00  2.512248e-01  3.359625e-02 
##          Eavg           S:z           S:p 
##  1.689702e-02  1.321099e-01  8.948974e-01

###Optional: đánh giá tầm quan trọng biến số (relative importance)

library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

df_bma$fx1 = as.factor (df_bma$fx)
m.rf = randomForest(fx1 ~ age + sex + weight + height + fnbmd + smoking + prior.fx, data = df_bma, importance = TRUE, ntrees = 1000)
importance <- importance(m.rf, type=1)
importance

##          MeanDecreaseAccuracy
## age                10.6987009
## sex                21.1423058
## weight             12.7431693
## height             23.4879275
## fnbmd              31.9355560
## smoking             0.4762935
## prior.fx           16.7340654

varImpPlot(m.rf, type = 1, main = "Relative Importance")

100126 - PTDL với R - VPN

Chuong, Nguyen-Dinh-nguyen

2026-01-10

Gói lệnh lessR

Tim mo hinh toi uu bang PP BMA