#Ngày 4: Hồi quy Logistic ## Việc 1. Đọc dữ liệu
df = read.csv("/Users/dr.ndnchuong/Desktop/Helix-Innovations/CME - Đào tạo Y khoa liên tục/2026 - CME/[01:2026] Học ngắn hạn - Sử dụng AI trong phân tích dữ liệu - GS Nguyễn Văn Tuấn/DỮ LIỆU THỰC HÀNH (Gửi học viên)/Bone data.csv")
###Việc 2. Mối liên quan giữa hút thuốc lá và gãy xương ###2.1. Mô hình hồi quy logistic #### Gói lệnh cơ bản
library(lessR)
## Warning: package 'lessR' was built under R version 4.5.2
##
## lessR 4.5 feedback: gerbing@pdx.edu
## --------------------------------------------------------------
## > d <- Read("") Read data file, many formats available, e.g., Excel
## d is the default data frame, data= in analysis routines optional
##
## Many examples of reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation to pivot tables.
## Enter: browseVignettes("lessR")
##
## View lessR updates, now including modern time series forecasting
## and many, new Plotly interactive visualizations output. Most
## visualization functions are now reorganized to three functions:
## Chart(): type="bar", "pie", "radar", "bubble", "treemap", "icicle"
## X(): type="histogram", "density", "vbs" and more
## XY(): type="scatter" for a scatterplot, or "contour", "smooth"
## Most previous function calls still work, such as:
## BarChart(), Histogram, and Plot().
## Enter: news(package="lessR"), or ?Chart, ?X, or ?XY
## There is also Flows() for Sankey flow diagrams, see ?Flows
##
## Interactive data analysis for constructing visualizations.
## Enter: interact()
model <- glm(fx ~ smoking, data = df, family = binomial)
summary(model)
##
## Call:
## glm(formula = fx ~ smoking, family = binomial, data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.03006 0.06442 -15.990 <2e-16 ***
## smoking -0.13796 0.10081 -1.368 0.171
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2441.4 on 2161 degrees of freedom
## Residual deviance: 2439.5 on 2160 degrees of freedom
## AIC: 2443.5
##
## Number of Fisher Scoring iterations: 4
library(lessR)
Logit(fx ~ smoking, data = df, brief = TRUE)
##
## Response Variable: fx
## Predictor Variable 1: smoking
##
## Number of cases (rows) of data: 2162
## Number of cases retained for analysis: 2162
##
##
## BASIC ANALYSIS
##
## -- Estimated Model of fx for the Logit of Reference Group Membership
##
## Estimate Std Err z-value p-value Lower 95% Upper 95%
## (Intercept) -1.0301 0.0644 -15.990 0.000 -1.1563 -0.9038
## smoking -0.1380 0.1008 -1.368 0.171 -0.3355 0.0596
##
##
## -- Odds Ratios and Confidence Intervals
##
## Odds Ratio Lower 95% Upper 95%
## (Intercept) 0.3570 0.3146 0.4050
## smoking 0.8711 0.7149 1.0614
##
##
## -- Model Fit
##
## Null deviance: 2441.375 on 2161 degrees of freedom
## Residual deviance: 2439.494 on 2160 degrees of freedom
##
## AIC: 2443.494
##
## Number of iterations to convergence: 4
library(lessR)
model.lessR <- Logit(fx ~ smoking, data = df, brief = FALSE)
##
## Response Variable: fx
## Predictor Variable 1: smoking
##
## Number of cases (rows) of data: 2162
## Number of cases retained for analysis: 2162
##
##
## BASIC ANALYSIS
##
## -- Estimated Model of fx for the Logit of Reference Group Membership
##
## Estimate Std Err z-value p-value Lower 95% Upper 95%
## (Intercept) -1.0301 0.0644 -15.990 0.000 -1.1563 -0.9038
## smoking -0.1380 0.1008 -1.368 0.171 -0.3355 0.0596
##
##
## -- Odds Ratios and Confidence Intervals
##
## Odds Ratio Lower 95% Upper 95%
## (Intercept) 0.3570 0.3146 0.4050
## smoking 0.8711 0.7149 1.0614
##
##
## -- Model Fit
##
## Null deviance: 2441.375 on 2161 degrees of freedom
## Residual deviance: 2439.494 on 2160 degrees of freedom
##
## AIC: 2443.494
##
## Number of iterations to convergence: 4
##
##
## ANALYSIS OF RESIDUALS AND INFLUENCE
## Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
## [sorted by Cook's Distance]
## [res_rows = 20 out of 2162 cases (rows) of data]
## --------------------------------------------------------------------
## smoking fx P(Y=1) residual rstudent dffits cooks
## 36 1 1 0.2372 0.7628 1.697 0.05273 0.001753
## 39 1 1 0.2372 0.7628 1.697 0.05273 0.001753
## 55 1 1 0.2372 0.7628 1.697 0.05273 0.001753
## 56 1 1 0.2372 0.7628 1.697 0.05273 0.001753
## 63 1 1 0.2372 0.7628 1.697 0.05273 0.001753
## 65 1 1 0.2372 0.7628 1.697 0.05273 0.001753
## 120 1 1 0.2372 0.7628 1.697 0.05273 0.001753
## 129 1 1 0.2372 0.7628 1.697 0.05273 0.001753
## 133 1 1 0.2372 0.7628 1.697 0.05273 0.001753
## 135 1 1 0.2372 0.7628 1.697 0.05273 0.001753
## 141 1 1 0.2372 0.7628 1.697 0.05273 0.001753
## 146 1 1 0.2372 0.7628 1.697 0.05273 0.001753
## 150 1 1 0.2372 0.7628 1.697 0.05273 0.001753
## 163 1 1 0.2372 0.7628 1.697 0.05273 0.001753
## 169 1 1 0.2372 0.7628 1.697 0.05273 0.001753
## 173 1 1 0.2372 0.7628 1.697 0.05273 0.001753
## 181 1 1 0.2372 0.7628 1.697 0.05273 0.001753
## 182 1 1 0.2372 0.7628 1.697 0.05273 0.001753
## 197 1 1 0.2372 0.7628 1.697 0.05273 0.001753
## 203 1 1 0.2372 0.7628 1.697 0.05273 0.001753
##
##
## PREDICTION
##
## Probability threshold for classification : 0.5
##
##
## Data, Fitted Values, Standard Errors
## [sorted by fitted value]
## [pred_all=TRUE to see all intervals displayed]
## --------------------------------------------------------------------
## smoking fx label fitted std.err
## 1 1 0 0 0.2372 0.01403
## 4 1 0 0 0.2372 0.01403
## 5 1 0 0 0.2372 0.01403
## 7 1 0 0 0.2372 0.01403
##
## ... for the rows of data where fitted is close to 0.5 ...
##
## smoking fx label fitted std.err
## 2148 1 0 0 0.2372 0.01403
## 2150 1 1 0 0.2372 0.01403
## 2 0 0 0 0.2631 0.01249
## 3 0 0 0 0.2631 0.01249
## 6 0 0 0 0.2631 0.01249
##
## ... for the last 4 rows of sorted data ...
##
## smoking fx label fitted std.err
## 2159 0 0 0 0.2631 0.01249
## 2160 0 1 0 0.2631 0.01249
## 2161 0 0 0 0.2631 0.01249
## 2162 0 1 0 0.2631 0.01249
## --------------------------------------------------------------------
##
##
## ----------------------------
## Specified confusion matrices
## ----------------------------
##
## Probability threshold for predicting : 0.5
## Corresponding cutoff threshold for smoking: -7.467
##
## Baseline Predicted
## ---------------------------------------------------
## Total %Tot 0 1 %Correct
## ---------------------------------------------------
## 1 545 25.2 545 0 0.0
## fx 0 1617 74.8 1617 0 100.0
## ---------------------------------------------------
## Total 2162 74.8
##
## Accuracy: 74.79
## Sensitivity: 0.00
## Precision: NaN
###Việc 3. Đánh giá mối liên quan độc lập giữa hút thuốc và nguy cơ gãy
xương ####3.1 Mô hình hồi qui logistic
model.2 <- glm(fx ~ smoking + sex, data = df, family = binomial)
summary(model.2)
##
## Call:
## glm(formula = fx ~ smoking + sex, family = binomial, data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.85598 0.06835 -12.523 < 2e-16 ***
## smoking 0.09872 0.10724 0.921 0.357
## sexMale -0.78880 0.11493 -6.863 6.73e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2441.4 on 2161 degrees of freedom
## Residual deviance: 2389.6 on 2159 degrees of freedom
## AIC: 2395.6
##
## Number of Fisher Scoring iterations: 4
model2.lessR = Logit(fx ~ smoking + sex, data = df)
##
## >>> Note: sex is not a numeric variable.
## Indicator variables are created and analyzed.
##
## Response Variable: fx
## Predictor Variable 1: smoking
## Predictor Variable 2: sexMale
##
## Number of cases (rows) of data: 2162
## Number of cases retained for analysis: 2162
##
##
## BASIC ANALYSIS
##
## -- Estimated Model of fx for the Logit of Reference Group Membership
##
## Estimate Std Err z-value p-value Lower 95% Upper 95%
## (Intercept) -0.8560 0.0684 -12.523 0.000 -0.9900 -0.7220
## smoking 0.0987 0.1072 0.921 0.357 -0.1115 0.3089
## sexMale -0.7888 0.1149 -6.863 0.000 -1.0141 -0.5635
##
##
## -- Odds Ratios and Confidence Intervals
##
## Odds Ratio Lower 95% Upper 95%
## (Intercept) 0.4249 0.3716 0.4858
## smoking 1.1038 0.8945 1.3619
## sexMale 0.4544 0.3627 0.5692
##
##
## -- Model Fit
##
## Null deviance: 2441.375 on 2161 degrees of freedom
## Residual deviance: 2389.599 on 2159 degrees of freedom
##
## AIC: 2395.599
##
## Number of iterations to convergence: 4
##
##
## Collinearity
##
## Tolerance VIF
## smoking 0.899 1.112
## sexMale 0.899 1.112
##
## ANALYSIS OF RESIDUALS AND INFLUENCE
## Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
## [sorted by Cook's Distance]
## [res_rows = 20 out of 2162 cases (rows) of data]
## --------------------------------------------------------------------
## smoking sexMale fx P(Y=1) residual rstudent dffits cooks
## 18 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 183 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 212 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 217 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 221 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 263 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 308 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 323 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 491 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 545 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 561 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 563 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 628 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 680 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 744 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 762 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 770 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 812 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 821 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 951 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
##
##
## PREDICTION
##
## Probability threshold for classification : 0.5
##
##
## Data, Fitted Values, Standard Errors
## [sorted by fitted value]
## [pred_all=TRUE to see all intervals displayed]
## --------------------------------------------------------------------
## smoking sexMale fx label fitted std.err
## 3 0 1 0 0 0.1618 0.01548
## 14 0 1 0 0 0.1618 0.01548
## 18 0 1 1 0 0.1618 0.01548
## 24 0 1 0 0 0.1618 0.01548
##
## ... for the rows of data where fitted is close to 0.5 ...
##
## smoking sexMale fx label fitted std.err
## 2160 0 0 1 0 0.2982 0.01430
## 2162 0 0 1 0 0.2982 0.01430
## 4 1 0 0 0 0.3192 0.02074
## 20 1 0 0 0 0.3192 0.02074
## 32 1 0 0 0 0.3192 0.02074
##
## ... for the last 4 rows of sorted data ...
##
## smoking sexMale fx label fitted std.err
## 2146 1 0 1 0 0.3192 0.02074
## 2147 1 0 0 0 0.3192 0.02074
## 2148 1 0 0 0 0.3192 0.02074
## 2150 1 0 1 0 0.3192 0.02074
## --------------------------------------------------------------------
##
##
## ----------------------------
## Specified confusion matrices
## ----------------------------
##
## Probability threshold for predicting : 0.5
##
## Baseline Predicted
## ---------------------------------------------------
## Total %Tot 0 1 %Correct
## ---------------------------------------------------
## 1 545 25.2 545 0 0.0
## fx 0 1617 74.8 1617 0 100.0
## ---------------------------------------------------
## Total 2162 74.8
##
## Accuracy: 74.79
## Sensitivity: 0.00
## Precision: NaN
Giai thich: Trong cung dieu kien hut thuoc nhu nhau, odds gay xuong cua
nam thap hon 55% so voi nu gioi, khoang tin cay 95% trong khoang 43% den
64% (P<.0001)
library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-7)
df_bma <- na.omit(df[, c("fx", "age", "sex", "weight", "height", "fnbmd", "smoking", "prior.fx")])
dim (df_bma)
## [1] 2121 8
bma_model <- bic.glm(
x = df_bma[, c("age", "sex", "weight", "height", "fnbmd", "smoking", "prior.fx")],
y = df_bma$fx,
glm.family = binomial()
)
summary(bma_model)
##
## Call:
## bic.glm.data.frame(x = df_bma[, c("age", "sex", "weight", "height", "fnbmd", "smoking", "prior.fx")], y = df_bma$fx, glm.family = binomial())
##
##
## 4 models were selected
## Best 4 models (cumulative posterior probability = 1 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100 2.249962 0.674258 2.509e+00 2.340e+00 1.140e+00
## age 16.2 0.002663 0.006838 . . 1.602e-02
## sex 18.2 -0.048048 0.114551 . -2.578e-01 .
## weight 0.0 0.000000 0.000000 . . .
## height 0.0 0.000000 0.000000 . . .
## fnbmd 100.0 -4.487636 0.436674 -4.595e+00 -4.279e+00 -4.302e+00
## smoking 0.0 0.000000 0.000000 . . .
## prior.fx 100.0 0.530729 0.134289 5.306e-01 5.445e-01 5.156e-01
##
## nVar 2 3 3
## BIC -1.402e+04 -1.402e+04 -1.402e+04
## post prob 0.696 0.142 0.122
## model 4
## Intercept 7.978e-01
## age 1.780e-02
## sex -2.851e-01
## weight .
## height .
## fnbmd -3.918e+00
## smoking .
## prior.fx 5.296e-01
##
## nVar 4
## BIC -1.402e+04
## post prob 0.040
df_bma$fnbmd.sd = df_bma$fnbmd/0.15
bma_best = glm(fx ~ fnbmd.sd + prior.fx, family = binomial, data = df_bma)
exp(cbind(Odds_Ratio = coef(bma_best), confint(bma_best)))
## Waiting for profiling to be done...
## Odds_Ratio 2.5 % 97.5 %
## (Intercept) 12.2977330 6.6858383 22.888616
## fnbmd.sd 0.5019265 0.4467839 0.562099
## prior.fx 1.6999821 1.3051619 2.207473
###ROC
m1 = glm(fx ~fnbmd.sd + prior.fx, family = binomial, data = df_bma)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
pred = predict(m1, type = "response")
roc_curve = roc(df_bma$fx ~ pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_curve)
## Area under the curve: 0.7043
ci(roc_curve)
## 95% CI: 0.6794-0.7292 (DeLong)
plot(roc_curve)
###Calibration
library(rms)
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:lessR':
##
## label, Merge
## The following objects are masked from 'package:base':
##
## format.pval, units
fit = lrm(fx ~ fnbmd.sd + prior.fx, data = df_bma)
pred2 = predict(fit)
phat = 1/(1 + exp(-pred2))
val.prob(phat,df_bma$fx, m = 20, cex = 0.5)
## Dxy C (ROC) R2 D D:Chi-sq
## 4.085621e-01 7.042810e-01 1.312413e-01 9.255161e-02 1.973020e+02
## D:p U U:Chi-sq U:p Q
## 0.000000e+00 -9.429514e-04 -4.547474e-13 1.000000e+00 9.349456e-02
## Brier Intercept Slope Emax E90
## 1.716198e-01 3.121412e-15 1.000000e+00 2.512248e-01 3.359625e-02
## Eavg S:z S:p
## 1.689702e-02 1.321099e-01 8.948974e-01
###Optional: đánh giá tầm quan trọng biến số (relative importance)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
df_bma$fx1 = as.factor (df_bma$fx)
m.rf = randomForest(fx1 ~ age + sex + weight + height + fnbmd + smoking + prior.fx, data = df_bma, importance = TRUE, ntrees = 1000)
importance <- importance(m.rf, type=1)
importance
## MeanDecreaseAccuracy
## age 10.6987009
## sex 21.1423058
## weight 12.7431693
## height 23.4879275
## fnbmd 31.9355560
## smoking 0.4762935
## prior.fx 16.7340654
varImpPlot(m.rf, type = 1, main = "Relative Importance")