df=read.csv("/Users/lynguyen/Documents/6 Dao tao/Tap_huan/R 2025/Tai_lieu_R/Bone data.csv")
dim (df)
## [1] 2162    9
head(df)
##   id    sex age weight height prior.fx fnbmd smoking fx
## 1  1   Male  73     98    175        0  1.08       1  0
## 2  2 Female  68     72    166        0  0.97       0  0
## 3  3   Male  68     87    184        0  1.01       0  0
## 4  4 Female  62     72    173        0  0.84       1  0
## 5  5   Male  61     72    173        0  0.81       1  0
## 6  6 Female  76     57    156        0  0.74       0  0

Việc 2. Mối liên quan giữa hút thuôc và nguy cơ gẫy xương

2.1 Mô hình hồi quy logistic đánh giá mối liên quan giữa hút thuốc và nguy cơ gẫy ương

Gối lệnh cơ bản

model <- glm(fx ~ smoking, data = df, family = binomial)
summary(model)
## 
## Call:
## glm(formula = fx ~ smoking, family = binomial, data = df)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.03006    0.06442 -15.990   <2e-16 ***
## smoking     -0.13796    0.10081  -1.368    0.171    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2441.4  on 2161  degrees of freedom
## Residual deviance: 2439.5  on 2160  degrees of freedom
## AIC: 2443.5
## 
## Number of Fisher Scoring iterations: 4
library(epiDisplay)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
library(lessR)
## 
## lessR 4.5                            feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is the default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation to pivot tables.
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including modern time series forecasting
##   and many, new Plotly interactive visualizations output. Most
##   visualization functions are now reorganized to three functions:
##      Chart(): type="bar", "pie", "radar", "bubble", "treemap", "icicle"
##      X(): type="histogram", "density", "vbs" and more
##      XY(): type="scatter" for a scatterplot, or "contour", "smooth"
##    Most previous function calls still work, such as:
##      BarChart(), Histogram, and Plot().
##   Enter: news(package="lessR"), or ?Chart, ?X, or ?XY
## There is also Flows() for Sankey flow diagrams, see ?Flows
## 
## Interactive data analysis for constructing visualizations.
##   Enter: interact()
model.lessR<-Logit(fx~smoking, data=df, brief=TRUE)
## 
## Response Variable:   fx
## Predictor Variable 1:  smoking
## 
## Number of cases (rows) of data:  2162 
## Number of cases retained for analysis:  2162 
## 
## 
##    BASIC ANALYSIS 
## 
## -- Estimated Model of fx for the Logit of Reference Group Membership
## 
##              Estimate    Std Err  z-value  p-value   Lower 95%   Upper 95%
## (Intercept)   -1.0301     0.0644  -15.990    0.000     -1.1563     -0.9038 
##     smoking   -0.1380     0.1008   -1.368    0.171     -0.3355      0.0596 
## 
## 
## -- Odds Ratios and Confidence Intervals
## 
##              Odds Ratio   Lower 95%   Upper 95%
## (Intercept)      0.3570      0.3146      0.4050 
##     smoking      0.8711      0.7149      1.0614 
## 
## 
## -- Model Fit
## 
##     Null deviance: 2441.375 on 2161 degrees of freedom
## Residual deviance: 2439.494 on 2160 degrees of freedom
## 
## AIC: 2443.494 
## 
## Number of iterations to convergence: 4
library(epiDisplay)
logistic.display(model)
## 
## Logistic regression predicting fx 
##  
##                  OR(95%CI)         P(Wald's test) P(LR-test)
## smoking: 1 vs 0  0.87 (0.71,1.06)  0.171          0.17      
##                                                             
## Log-likelihood = -1219.7469
## No. of observations = 2162
## AIC value = 2443.4938
summary(model.lessR)
## 
## Call:
## glm(formula = my_formula, family = "binomial", data = data)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.03006    0.06442 -15.990   <2e-16 ***
## smoking     -0.13796    0.10081  -1.368    0.171    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2441.4  on 2161  degrees of freedom
## Residual deviance: 2439.5  on 2160  degrees of freedom
## AIC: 2443.5
## 
## Number of Fisher Scoring iterations: 4

##Việc 3. Đánh giá mối liên quan độc lập giữa hút thuốc và nguy cơ gãy xương 3.1 Mô hình hồi qui logistic

library(lessR)
model.2 <- glm(fx ~ smoking + sex, data = df, family = binomial)
logistic.display(model)
## 
## Logistic regression predicting fx 
##  
##                  OR(95%CI)         P(Wald's test) P(LR-test)
## smoking: 1 vs 0  0.87 (0.71,1.06)  0.171          0.17      
##                                                             
## Log-likelihood = -1219.7469
## No. of observations = 2162
## AIC value = 2443.4938
summary(model.2)
## 
## Call:
## glm(formula = fx ~ smoking + sex, family = binomial, data = df)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.85598    0.06835 -12.523  < 2e-16 ***
## smoking      0.09872    0.10724   0.921    0.357    
## sexMale     -0.78880    0.11493  -6.863 6.73e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2441.4  on 2161  degrees of freedom
## Residual deviance: 2389.6  on 2159  degrees of freedom
## AIC: 2395.6
## 
## Number of Fisher Scoring iterations: 4
exp(cbind(Odds_Ratio = coef(model.2), confint(model.2)))
## Waiting for profiling to be done...
##             Odds_Ratio     2.5 %    97.5 %
## (Intercept)  0.4248646 0.3711574 0.4852461
## smoking      1.1037584 0.8942327 1.3617209
## sexMale      0.4543898 0.3619039 0.5680105
model2.lessR = Logit(fx ~ smoking + sex, data = df)
## 
## >>> Note:  sex is not a numeric variable.
##            Indicator variables are created and analyzed.
## 
## Response Variable:   fx
## Predictor Variable 1:  smoking
## Predictor Variable 2:  sexMale
## 
## Number of cases (rows) of data:  2162 
## Number of cases retained for analysis:  2162 
## 
## 
##    BASIC ANALYSIS 
## 
## -- Estimated Model of fx for the Logit of Reference Group Membership
## 
##              Estimate    Std Err  z-value  p-value   Lower 95%   Upper 95%
## (Intercept)   -0.8560     0.0684  -12.523    0.000     -0.9900     -0.7220 
##     smoking    0.0987     0.1072    0.921    0.357     -0.1115      0.3089 
##     sexMale   -0.7888     0.1149   -6.863    0.000     -1.0141     -0.5635 
## 
## 
## -- Odds Ratios and Confidence Intervals
## 
##              Odds Ratio   Lower 95%   Upper 95%
## (Intercept)      0.4249      0.3716      0.4858 
##     smoking      1.1038      0.8945      1.3619 
##     sexMale      0.4544      0.3627      0.5692 
## 
## 
## -- Model Fit
## 
##     Null deviance: 2441.375 on 2161 degrees of freedom
## Residual deviance: 2389.599 on 2159 degrees of freedom
## 
## AIC: 2395.599 
## 
## Number of iterations to convergence: 4 
## 
## 
## Collinearity
## 
##         Tolerance       VIF
## smoking     0.899     1.112
## sexMale     0.899     1.112
## 
##    ANALYSIS OF RESIDUALS AND INFLUENCE 
## Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
##    [sorted by Cook's Distance]
##    [res_rows = 20 out of 2162 cases (rows) of data]
## --------------------------------------------------------------------
##     smoking sexMale fx P(Y=1) residual rstudent  dffits    cooks
## 18        0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 183       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 212       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 217       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 221       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 263       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 308       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 323       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 491       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 545       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 561       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 563       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 628       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 680       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 744       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 762       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 770       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 812       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 821       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 951       0       1  1 0.1618   0.8382    1.911 0.07645 0.003063
## 
## 
##    PREDICTION 
## 
## Probability threshold for classification : 0.5
## 
## 
## Data, Fitted Values, Standard Errors
##    [sorted by fitted value]
##    [pred_all=TRUE to see all intervals displayed]
## --------------------------------------------------------------------
##    smoking sexMale fx label fitted std.err
## 3        0       1  0     0 0.1618 0.01548
## 14       0       1  0     0 0.1618 0.01548
## 18       0       1  1     0 0.1618 0.01548
## 24       0       1  0     0 0.1618 0.01548
## 
## ... for the rows of data where fitted is close to 0.5 ...
## 
##      smoking sexMale fx label fitted std.err
## 2160       0       0  1     0 0.2982 0.01430
## 2162       0       0  1     0 0.2982 0.01430
## 4          1       0  0     0 0.3192 0.02074
## 20         1       0  0     0 0.3192 0.02074
## 32         1       0  0     0 0.3192 0.02074
## 
## ... for the last 4 rows of sorted data ...
## 
##      smoking sexMale fx label fitted std.err
## 2146       1       0  1     0 0.3192 0.02074
## 2147       1       0  0     0 0.3192 0.02074
## 2148       1       0  0     0 0.3192 0.02074
## 2150       1       0  1     0 0.3192 0.02074
## --------------------------------------------------------------------
## 
## 
## ----------------------------
## Specified confusion matrices
## ----------------------------
## 
## Probability threshold for predicting : 0.5
## 
##            Baseline         Predicted 
## ---------------------------------------------------
##           Total  %Tot        0      1  %Correct 
## ---------------------------------------------------
##      1      545  25.2      545      0     0.0 
## fx   0     1617  74.8     1617      0     100.0 
## ---------------------------------------------------
##    Total   2162                           74.8 
## 
## Accuracy: 74.79 
## Sensitivity: 0.00 
## Precision: NaN

##Việc 4. Xây dựng mô hình dự báo nguy cơ gãy xương 4.1 Tìm mô hình tối ưu bằng pp BMA

library(BMA)
## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-7)
df_bma <- na.omit(df[, c("fx", "age", "sex", "weight", "height", "fnbmd", "smoking", "prior.fx")]) ## bỏ giá trị trống na
dim(df_bma)
## [1] 2121    8
bma_model <- bic.glm(
  x = df_bma[, c("age", "sex", "weight", "height", "fnbmd", "smoking", "prior.fx")],
  y = df_bma$fx,
  glm.family = binomial()
)
summary(bma_model)
## 
## Call:
## bic.glm.data.frame(x = df_bma[, c("age", "sex", "weight", "height",     "fnbmd", "smoking", "prior.fx")], y = df_bma$fx, glm.family = binomial())
## 
## 
##   4  models were selected
##  Best  4  models (cumulative posterior probability =  1 ): 
## 
##            p!=0    EV        SD        model 1     model 2     model 3   
## Intercept  100     2.249962  0.674258   2.509e+00   2.340e+00   1.140e+00
## age         16.2   0.002663  0.006838       .           .       1.602e-02
## sex         18.2  -0.048048  0.114551       .      -2.578e-01       .    
## weight       0.0   0.000000  0.000000       .           .           .    
## height       0.0   0.000000  0.000000       .           .           .    
## fnbmd      100.0  -4.487636  0.436674  -4.595e+00  -4.279e+00  -4.302e+00
## smoking      0.0   0.000000  0.000000       .           .           .    
## prior.fx   100.0   0.530729  0.134289   5.306e-01   5.445e-01   5.156e-01
##                                                                          
## nVar                                      2           3           3      
## BIC                                    -1.402e+04  -1.402e+04  -1.402e+04
## post prob                               0.696       0.142       0.122    
##            model 4   
## Intercept   7.978e-01
## age         1.780e-02
## sex        -2.851e-01
## weight          .    
## height          .    
## fnbmd      -3.918e+00
## smoking         .    
## prior.fx    5.296e-01
##                      
## nVar          4      
## BIC        -1.402e+04
## post prob   0.040
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:epiDisplay':
## 
##     ci
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

##4.2 Mô hình tối ưu

df_bma$fnbmd.sd = df_bma$fnbmd/0.15
bma_best = glm(fx ~ fnbmd.sd + prior.fx, family = binomial, data = df_bma)
exp(cbind(Odds_Ratio = coef(bma_best), confint(bma_best)))
## Waiting for profiling to be done...
##             Odds_Ratio     2.5 %    97.5 %
## (Intercept) 12.2977330 6.6858383 22.888616
## fnbmd.sd     0.5019265 0.4467839  0.562099
## prior.fx     1.6999821 1.3051619  2.207473
m1 = glm(fx ~ fnbmd.sd + prior.fx, family = binomial, data = df_bma)

library(pROC)

pred = predict(m1, type = "response")
roc_curve = roc(df_bma$fx ~ pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_curve)
## Area under the curve: 0.7043
ci(roc_curve)
## 95% CI: 0.6794-0.7292 (DeLong)
plot(roc_curve)

### Đánh giá tầm quan trọng của biến số

## Optional: danh gia tam quan trong bien so (relative importance)

library(randomForest)## gói lựa chọn tầm quan trọng của biến số
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
df_bma$fx1 = as.factor(df_bma$fx)

m.rf = randomForest(
  fx1 ~ age + sex + weight + height + fnbmd + smoking + prior.fx,
  data = df_bma,
  importance = TRUE,
  ntree = 1000
)

importance <- importance(m.rf, type = 1)
importance
##          MeanDecreaseAccuracy
## age                 15.487436
## sex                 27.291154
## weight              17.011171
## height              31.310246
## fnbmd               40.775394
## smoking             -2.421246
## prior.fx            23.559115
varImpPlot(m.rf, type = 1, main = "Relative Importance")