df=read.csv("/Users/lynguyen/Documents/6 Dao tao/Tap_huan/R 2025/Tai_lieu_R/Bone data.csv")
dim (df)
## [1] 2162 9
head(df)
## id sex age weight height prior.fx fnbmd smoking fx
## 1 1 Male 73 98 175 0 1.08 1 0
## 2 2 Female 68 72 166 0 0.97 0 0
## 3 3 Male 68 87 184 0 1.01 0 0
## 4 4 Female 62 72 173 0 0.84 1 0
## 5 5 Male 61 72 173 0 0.81 1 0
## 6 6 Female 76 57 156 0 0.74 0 0
model <- glm(fx ~ smoking, data = df, family = binomial)
summary(model)
##
## Call:
## glm(formula = fx ~ smoking, family = binomial, data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.03006 0.06442 -15.990 <2e-16 ***
## smoking -0.13796 0.10081 -1.368 0.171
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2441.4 on 2161 degrees of freedom
## Residual deviance: 2439.5 on 2160 degrees of freedom
## AIC: 2443.5
##
## Number of Fisher Scoring iterations: 4
library(epiDisplay)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
library(lessR)
##
## lessR 4.5 feedback: gerbing@pdx.edu
## --------------------------------------------------------------
## > d <- Read("") Read data file, many formats available, e.g., Excel
## d is the default data frame, data= in analysis routines optional
##
## Many examples of reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation to pivot tables.
## Enter: browseVignettes("lessR")
##
## View lessR updates, now including modern time series forecasting
## and many, new Plotly interactive visualizations output. Most
## visualization functions are now reorganized to three functions:
## Chart(): type="bar", "pie", "radar", "bubble", "treemap", "icicle"
## X(): type="histogram", "density", "vbs" and more
## XY(): type="scatter" for a scatterplot, or "contour", "smooth"
## Most previous function calls still work, such as:
## BarChart(), Histogram, and Plot().
## Enter: news(package="lessR"), or ?Chart, ?X, or ?XY
## There is also Flows() for Sankey flow diagrams, see ?Flows
##
## Interactive data analysis for constructing visualizations.
## Enter: interact()
model.lessR<-Logit(fx~smoking, data=df, brief=TRUE)
##
## Response Variable: fx
## Predictor Variable 1: smoking
##
## Number of cases (rows) of data: 2162
## Number of cases retained for analysis: 2162
##
##
## BASIC ANALYSIS
##
## -- Estimated Model of fx for the Logit of Reference Group Membership
##
## Estimate Std Err z-value p-value Lower 95% Upper 95%
## (Intercept) -1.0301 0.0644 -15.990 0.000 -1.1563 -0.9038
## smoking -0.1380 0.1008 -1.368 0.171 -0.3355 0.0596
##
##
## -- Odds Ratios and Confidence Intervals
##
## Odds Ratio Lower 95% Upper 95%
## (Intercept) 0.3570 0.3146 0.4050
## smoking 0.8711 0.7149 1.0614
##
##
## -- Model Fit
##
## Null deviance: 2441.375 on 2161 degrees of freedom
## Residual deviance: 2439.494 on 2160 degrees of freedom
##
## AIC: 2443.494
##
## Number of iterations to convergence: 4
library(epiDisplay)
logistic.display(model)
##
## Logistic regression predicting fx
##
## OR(95%CI) P(Wald's test) P(LR-test)
## smoking: 1 vs 0 0.87 (0.71,1.06) 0.171 0.17
##
## Log-likelihood = -1219.7469
## No. of observations = 2162
## AIC value = 2443.4938
summary(model.lessR)
##
## Call:
## glm(formula = my_formula, family = "binomial", data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.03006 0.06442 -15.990 <2e-16 ***
## smoking -0.13796 0.10081 -1.368 0.171
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2441.4 on 2161 degrees of freedom
## Residual deviance: 2439.5 on 2160 degrees of freedom
## AIC: 2443.5
##
## Number of Fisher Scoring iterations: 4
##Việc 3. Đánh giá mối liên quan độc lập giữa hút thuốc và nguy cơ gãy xương 3.1 Mô hình hồi qui logistic
library(lessR)
model.2 <- glm(fx ~ smoking + sex, data = df, family = binomial)
logistic.display(model)
##
## Logistic regression predicting fx
##
## OR(95%CI) P(Wald's test) P(LR-test)
## smoking: 1 vs 0 0.87 (0.71,1.06) 0.171 0.17
##
## Log-likelihood = -1219.7469
## No. of observations = 2162
## AIC value = 2443.4938
summary(model.2)
##
## Call:
## glm(formula = fx ~ smoking + sex, family = binomial, data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.85598 0.06835 -12.523 < 2e-16 ***
## smoking 0.09872 0.10724 0.921 0.357
## sexMale -0.78880 0.11493 -6.863 6.73e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2441.4 on 2161 degrees of freedom
## Residual deviance: 2389.6 on 2159 degrees of freedom
## AIC: 2395.6
##
## Number of Fisher Scoring iterations: 4
exp(cbind(Odds_Ratio = coef(model.2), confint(model.2)))
## Waiting for profiling to be done...
## Odds_Ratio 2.5 % 97.5 %
## (Intercept) 0.4248646 0.3711574 0.4852461
## smoking 1.1037584 0.8942327 1.3617209
## sexMale 0.4543898 0.3619039 0.5680105
model2.lessR = Logit(fx ~ smoking + sex, data = df)
##
## >>> Note: sex is not a numeric variable.
## Indicator variables are created and analyzed.
##
## Response Variable: fx
## Predictor Variable 1: smoking
## Predictor Variable 2: sexMale
##
## Number of cases (rows) of data: 2162
## Number of cases retained for analysis: 2162
##
##
## BASIC ANALYSIS
##
## -- Estimated Model of fx for the Logit of Reference Group Membership
##
## Estimate Std Err z-value p-value Lower 95% Upper 95%
## (Intercept) -0.8560 0.0684 -12.523 0.000 -0.9900 -0.7220
## smoking 0.0987 0.1072 0.921 0.357 -0.1115 0.3089
## sexMale -0.7888 0.1149 -6.863 0.000 -1.0141 -0.5635
##
##
## -- Odds Ratios and Confidence Intervals
##
## Odds Ratio Lower 95% Upper 95%
## (Intercept) 0.4249 0.3716 0.4858
## smoking 1.1038 0.8945 1.3619
## sexMale 0.4544 0.3627 0.5692
##
##
## -- Model Fit
##
## Null deviance: 2441.375 on 2161 degrees of freedom
## Residual deviance: 2389.599 on 2159 degrees of freedom
##
## AIC: 2395.599
##
## Number of iterations to convergence: 4
##
##
## Collinearity
##
## Tolerance VIF
## smoking 0.899 1.112
## sexMale 0.899 1.112
##
## ANALYSIS OF RESIDUALS AND INFLUENCE
## Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
## [sorted by Cook's Distance]
## [res_rows = 20 out of 2162 cases (rows) of data]
## --------------------------------------------------------------------
## smoking sexMale fx P(Y=1) residual rstudent dffits cooks
## 18 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 183 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 212 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 217 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 221 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 263 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 308 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 323 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 491 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 545 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 561 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 563 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 628 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 680 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 744 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 762 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 770 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 812 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 821 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
## 951 0 1 1 0.1618 0.8382 1.911 0.07645 0.003063
##
##
## PREDICTION
##
## Probability threshold for classification : 0.5
##
##
## Data, Fitted Values, Standard Errors
## [sorted by fitted value]
## [pred_all=TRUE to see all intervals displayed]
## --------------------------------------------------------------------
## smoking sexMale fx label fitted std.err
## 3 0 1 0 0 0.1618 0.01548
## 14 0 1 0 0 0.1618 0.01548
## 18 0 1 1 0 0.1618 0.01548
## 24 0 1 0 0 0.1618 0.01548
##
## ... for the rows of data where fitted is close to 0.5 ...
##
## smoking sexMale fx label fitted std.err
## 2160 0 0 1 0 0.2982 0.01430
## 2162 0 0 1 0 0.2982 0.01430
## 4 1 0 0 0 0.3192 0.02074
## 20 1 0 0 0 0.3192 0.02074
## 32 1 0 0 0 0.3192 0.02074
##
## ... for the last 4 rows of sorted data ...
##
## smoking sexMale fx label fitted std.err
## 2146 1 0 1 0 0.3192 0.02074
## 2147 1 0 0 0 0.3192 0.02074
## 2148 1 0 0 0 0.3192 0.02074
## 2150 1 0 1 0 0.3192 0.02074
## --------------------------------------------------------------------
##
##
## ----------------------------
## Specified confusion matrices
## ----------------------------
##
## Probability threshold for predicting : 0.5
##
## Baseline Predicted
## ---------------------------------------------------
## Total %Tot 0 1 %Correct
## ---------------------------------------------------
## 1 545 25.2 545 0 0.0
## fx 0 1617 74.8 1617 0 100.0
## ---------------------------------------------------
## Total 2162 74.8
##
## Accuracy: 74.79
## Sensitivity: 0.00
## Precision: NaN
##Việc 4. Xây dựng mô hình dự báo nguy cơ gãy xương 4.1 Tìm mô hình tối
ưu bằng pp BMA
library(BMA)
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-7)
df_bma <- na.omit(df[, c("fx", "age", "sex", "weight", "height", "fnbmd", "smoking", "prior.fx")]) ## bỏ giá trị trống na
dim(df_bma)
## [1] 2121 8
bma_model <- bic.glm(
x = df_bma[, c("age", "sex", "weight", "height", "fnbmd", "smoking", "prior.fx")],
y = df_bma$fx,
glm.family = binomial()
)
summary(bma_model)
##
## Call:
## bic.glm.data.frame(x = df_bma[, c("age", "sex", "weight", "height", "fnbmd", "smoking", "prior.fx")], y = df_bma$fx, glm.family = binomial())
##
##
## 4 models were selected
## Best 4 models (cumulative posterior probability = 1 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100 2.249962 0.674258 2.509e+00 2.340e+00 1.140e+00
## age 16.2 0.002663 0.006838 . . 1.602e-02
## sex 18.2 -0.048048 0.114551 . -2.578e-01 .
## weight 0.0 0.000000 0.000000 . . .
## height 0.0 0.000000 0.000000 . . .
## fnbmd 100.0 -4.487636 0.436674 -4.595e+00 -4.279e+00 -4.302e+00
## smoking 0.0 0.000000 0.000000 . . .
## prior.fx 100.0 0.530729 0.134289 5.306e-01 5.445e-01 5.156e-01
##
## nVar 2 3 3
## BIC -1.402e+04 -1.402e+04 -1.402e+04
## post prob 0.696 0.142 0.122
## model 4
## Intercept 7.978e-01
## age 1.780e-02
## sex -2.851e-01
## weight .
## height .
## fnbmd -3.918e+00
## smoking .
## prior.fx 5.296e-01
##
## nVar 4
## BIC -1.402e+04
## post prob 0.040
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:epiDisplay':
##
## ci
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
##4.2 Mô hình tối ưu
df_bma$fnbmd.sd = df_bma$fnbmd/0.15
bma_best = glm(fx ~ fnbmd.sd + prior.fx, family = binomial, data = df_bma)
exp(cbind(Odds_Ratio = coef(bma_best), confint(bma_best)))
## Waiting for profiling to be done...
## Odds_Ratio 2.5 % 97.5 %
## (Intercept) 12.2977330 6.6858383 22.888616
## fnbmd.sd 0.5019265 0.4467839 0.562099
## prior.fx 1.6999821 1.3051619 2.207473
m1 = glm(fx ~ fnbmd.sd + prior.fx, family = binomial, data = df_bma)
library(pROC)
pred = predict(m1, type = "response")
roc_curve = roc(df_bma$fx ~ pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_curve)
## Area under the curve: 0.7043
ci(roc_curve)
## 95% CI: 0.6794-0.7292 (DeLong)
plot(roc_curve)
### Đánh giá tầm quan trọng của biến số
## Optional: danh gia tam quan trong bien so (relative importance)
library(randomForest)## gói lựa chọn tầm quan trọng của biến số
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
df_bma$fx1 = as.factor(df_bma$fx)
m.rf = randomForest(
fx1 ~ age + sex + weight + height + fnbmd + smoking + prior.fx,
data = df_bma,
importance = TRUE,
ntree = 1000
)
importance <- importance(m.rf, type = 1)
importance
## MeanDecreaseAccuracy
## age 15.487436
## sex 27.291154
## weight 17.011171
## height 31.310246
## fnbmd 40.775394
## smoking -2.421246
## prior.fx 23.559115
varImpPlot(m.rf, type = 1, main = "Relative Importance")