df = read.csv("/Users/trangan/Desktop/AI in Data Analysis Course/Data Practice/Bone data.csv")
dim(df)
## [1] 2162 9
head(df)
## id sex age weight height prior.fx fnbmd smoking fx
## 1 1 Male 73 98 175 0 1.08 1 0
## 2 2 Female 68 72 166 0 0.97 0 0
## 3 3 Male 68 87 184 0 1.01 0 0
## 4 4 Female 62 72 173 0 0.84 1 0
## 5 5 Male 61 72 173 0 0.81 1 0
## 6 6 Female 76 57 156 0 0.74 0 0
Gói lệnh cơ bản
model = glm(fx ~ smoking, data = df, family = binomial)
summary(model)
##
## Call:
## glm(formula = fx ~ smoking, family = binomial, data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.03006 0.06442 -15.990 <2e-16 ***
## smoking -0.13796 0.10081 -1.368 0.171
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2441.4 on 2161 degrees of freedom
## Residual deviance: 2439.5 on 2160 degrees of freedom
## AIC: 2443.5
##
## Number of Fisher Scoring iterations: 4
exp(-.13796) # OR
## [1] 0.8711335
exp(-.13796 - 1.96*.10081) # tính toán khoảng tin cậy 95%
## [1] 0.7149465
exp(-.13796 + 1.96*.10081) # tính toán khoảng tin cậy 95%
## [1] 1.061441
library(lessR)
## Warning: package 'lessR' was built under R version 4.4.1
##
## lessR 4.5 feedback: gerbing@pdx.edu
## --------------------------------------------------------------
## > d <- Read("") Read data file, many formats available, e.g., Excel
## d is the default data frame, data= in analysis routines optional
##
## Many examples of reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation to pivot tables.
## Enter: browseVignettes("lessR")
##
## View lessR updates, now including modern time series forecasting
## and many, new Plotly interactive visualizations output. Most
## visualization functions are now reorganized to three functions:
## Chart(): type="bar", "pie", "radar", "bubble", "treemap", "icicle"
## X(): type="histogram", "density", "vbs" and more
## XY(): type="scatter" for a scatterplot, or "contour", "smooth"
## Most previous function calls still work, such as:
## BarChart(), Histogram, and Plot().
## Enter: news(package="lessR"), or ?Chart, ?X, or ?XY
## There is also Flows() for Sankey flow diagrams, see ?Flows
##
## Interactive data analysis for constructing visualizations.
## Enter: interact()
model.lessR = Logit(fx~ smoking, data = df, brief = T)
##
## Response Variable: fx
## Predictor Variable 1: smoking
##
## Number of cases (rows) of data: 2162
## Number of cases retained for analysis: 2162
##
##
## BASIC ANALYSIS
##
## -- Estimated Model of fx for the Logit of Reference Group Membership
##
## Estimate Std Err z-value p-value Lower 95% Upper 95%
## (Intercept) -1.0301 0.0644 -15.990 0.000 -1.1563 -0.9038
## smoking -0.1380 0.1008 -1.368 0.171 -0.3355 0.0596
##
##
## -- Odds Ratios and Confidence Intervals
##
## Odds Ratio Lower 95% Upper 95%
## (Intercept) 0.3570 0.3146 0.4050
## smoking 0.8711 0.7149 1.0614
##
##
## -- Model Fit
##
## Null deviance: 2441.375 on 2161 degrees of freedom
## Residual deviance: 2439.494 on 2160 degrees of freedom
##
## AIC: 2443.494
##
## Number of iterations to convergence: 4
PROMT: tôi có dữ liệu đánh giá nguy cơ gãy xương. Bạn giúp viết lệnh xây dựng mô hình hối qui logistic đánh giá mối liên quan giữa hút thuốc (smoking: 0= No; 1= Yes) và nguy cơ gãy xương (fx: 0= No; 1= Yes)
model.2 = glm(fx ~ smoking + sex, data = df, family = binomial)
summary(model.2)
##
## Call:
## glm(formula = fx ~ smoking + sex, family = binomial, data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.85598 0.06835 -12.523 < 2e-16 ***
## smoking 0.09872 0.10724 0.921 0.357
## sexMale -0.78880 0.11493 -6.863 6.73e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2441.4 on 2161 degrees of freedom
## Residual deviance: 2389.6 on 2159 degrees of freedom
## AIC: 2395.6
##
## Number of Fisher Scoring iterations: 4
model2.lessR = Logit(fx ~ smoking + sex, data = df, brief=T)
##
## >>> Note: sex is not a numeric variable.
## Indicator variables are created and analyzed.
##
## Response Variable: fx
## Predictor Variable 1: smoking
## Predictor Variable 2: sexMale
##
## Number of cases (rows) of data: 2162
## Number of cases retained for analysis: 2162
##
##
## BASIC ANALYSIS
##
## -- Estimated Model of fx for the Logit of Reference Group Membership
##
## Estimate Std Err z-value p-value Lower 95% Upper 95%
## (Intercept) -0.8560 0.0684 -12.523 0.000 -0.9900 -0.7220
## smoking 0.0987 0.1072 0.921 0.357 -0.1115 0.3089
## sexMale -0.7888 0.1149 -6.863 0.000 -1.0141 -0.5635
##
##
## -- Odds Ratios and Confidence Intervals
##
## Odds Ratio Lower 95% Upper 95%
## (Intercept) 0.4249 0.3716 0.4858
## smoking 1.1038 0.8945 1.3619
## sexMale 0.4544 0.3627 0.5692
##
##
## -- Model Fit
##
## Null deviance: 2441.375 on 2161 degrees of freedom
## Residual deviance: 2389.599 on 2159 degrees of freedom
##
## AIC: 2395.599
##
## Number of iterations to convergence: 4
##
##
## Collinearity
##
## Tolerance VIF
## smoking 0.899 1.112
## sexMale 0.899 1.112
library(BMA)
## Warning: package 'BMA' was built under R version 4.4.1
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
## Warning: package 'robustbase' was built under R version 4.4.1
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Warning: package 'inline' was built under R version 4.4.1
## Loading required package: rrcov
## Warning: package 'rrcov' was built under R version 4.4.1
## Scalable Robust Estimators with High Breakdown Point (version 1.7-7)
df_bma <- na.omit(df[, c("fx", "age", "sex", "weight", "height", "fnbmd", "smoking", "prior.fx")])
dim(df_bma)
## [1] 2121 8
dim(df)
## [1] 2162 9
df_bma <- na.omit(df[, c("fx", "age", "sex", "weight", "height", "fnbmd", "smoking", "prior.fx")])
bma_model <- bic.glm(
x = df_bma[, c("age", "sex", "weight", "height", "fnbmd", "smoking", "prior.fx")],
y = df_bma$fx,
glm.family = binomial()
)
summary(bma_model)
##
## Call:
## bic.glm.data.frame(x = df_bma[, c("age", "sex", "weight", "height", "fnbmd", "smoking", "prior.fx")], y = df_bma$fx, glm.family = binomial())
##
##
## 4 models were selected
## Best 4 models (cumulative posterior probability = 1 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100 2.249962 0.674258 2.509e+00 2.340e+00 1.140e+00
## age 16.2 0.002663 0.006838 . . 1.602e-02
## sex 18.2 -0.048048 0.114551 . -2.578e-01 .
## weight 0.0 0.000000 0.000000 . . .
## height 0.0 0.000000 0.000000 . . .
## fnbmd 100.0 -4.487636 0.436674 -4.595e+00 -4.279e+00 -4.302e+00
## smoking 0.0 0.000000 0.000000 . . .
## prior.fx 100.0 0.530729 0.134289 5.306e-01 5.445e-01 5.156e-01
##
## nVar 2 3 3
## BIC -1.402e+04 -1.402e+04 -1.402e+04
## post prob 0.696 0.142 0.122
## model 4
## Intercept 7.978e-01
## age 1.780e-02
## sex -2.851e-01
## weight .
## height .
## fnbmd -3.918e+00
## smoking .
## prior.fx 5.296e-01
##
## nVar 4
## BIC -1.402e+04
## post prob 0.040
m1 = Logit(fx ~ fnbmd + prior.fx, data = df_bma, brief = T)
##
## Response Variable: fx
## Predictor Variable 1: fnbmd
## Predictor Variable 2: prior.fx
##
## Number of cases (rows) of data: 2121
## Number of cases retained for analysis: 2121
##
##
## BASIC ANALYSIS
##
## -- Estimated Model of fx for the Logit of Reference Group Membership
##
## Estimate Std Err z-value p-value Lower 95% Upper 95%
## (Intercept) 2.5094 0.3138 7.997 0.000 1.8944 3.1245
## fnbmd -4.5953 0.3903 -11.773 0.000 -5.3604 -3.8303
## prior.fx 0.5306 0.1340 3.961 0.000 0.2680 0.7932
##
##
## -- Odds Ratios and Confidence Intervals
##
## Odds Ratio Lower 95% Upper 95%
## (Intercept) 12.2977 6.6484 22.7474
## fnbmd 0.0101 0.0047 0.0217
## prior.fx 1.7000 1.3074 2.2105
##
##
## -- Model Fit
##
## Null deviance: 2395.806 on 2120 degrees of freedom
## Residual deviance: 2198.504 on 2118 degrees of freedom
##
## AIC: 2204.504
##
## Number of iterations to convergence: 4
##
##
## Collinearity
##
## Tolerance VIF
## fnbmd 0.973 1.027
## prior.fx 0.973 1.027
nếu có hệ số OR > 0.01 thì phải đặt câu hỏi
summary(df_bma)
## fx age sex weight
## Min. :0.0000 Min. :57.00 Length:2121 Min. : 34.00
## 1st Qu.:0.0000 1st Qu.:65.00 Class :character 1st Qu.: 60.00
## Median :0.0000 Median :69.00 Mode :character Median : 69.00
## Mean :0.2522 Mean :70.56 Mean : 70.08
## 3rd Qu.:1.0000 3rd Qu.:75.00 3rd Qu.: 79.00
## Max. :1.0000 Max. :94.00 Max. :133.00
## height fnbmd smoking prior.fx
## Min. :136.0 Min. :0.2800 Min. :0.0000 Min. :0.0000
## 1st Qu.:158.0 1st Qu.:0.7300 1st Qu.:0.0000 1st Qu.:0.0000
## Median :164.0 Median :0.8200 Median :0.0000 Median :0.0000
## Mean :164.9 Mean :0.8287 Mean :0.4234 Mean :0.1542
## 3rd Qu.:171.0 3rd Qu.:0.9300 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :196.0 Max. :1.5100 Max. :1.0000 Max. :1.0000
df_bma$fnbmd.sd = df_bma$fnbmd/0.15
m2 = Logit(fx ~ fnbmd.sd + prior.fx, data = df_bma, brief = T)
##
## Response Variable: fx
## Predictor Variable 1: fnbmd.sd
## Predictor Variable 2: prior.fx
##
## Number of cases (rows) of data: 2121
## Number of cases retained for analysis: 2121
##
##
## BASIC ANALYSIS
##
## -- Estimated Model of fx for the Logit of Reference Group Membership
##
## Estimate Std Err z-value p-value Lower 95% Upper 95%
## (Intercept) 2.5094 0.3138 7.997 0.000 1.8944 3.1245
## fnbmd.sd -0.6893 0.0585 -11.773 0.000 -0.8041 -0.5745
## prior.fx 0.5306 0.1340 3.961 0.000 0.2680 0.7932
##
##
## -- Odds Ratios and Confidence Intervals
##
## Odds Ratio Lower 95% Upper 95%
## (Intercept) 12.2977 6.6484 22.7474
## fnbmd.sd 0.5019 0.4475 0.5630
## prior.fx 1.7000 1.3074 2.2105
##
##
## -- Model Fit
##
## Null deviance: 2395.806 on 2120 degrees of freedom
## Residual deviance: 2198.504 on 2118 degrees of freedom
##
## AIC: 2204.504
##
## Number of iterations to convergence: 4
##
##
## Collinearity
##
## Tolerance VIF
## fnbmd.sd 0.973 1.027
## prior.fx 0.973 1.027
m1 = glm(fx ~ fnbmd.sd + prior.fx, family = binomial, data = df_bma)
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.1
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
pred = predict(m1, type = "response") # dự báo mô hình m1 và dự báo biến kết cục
roc_curve = roc(df_bma$fx ~ pred) # so giá trị thật trong dữ liệu df_bma với giá trị pre
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_curve)
## Area under the curve: 0.7043
ci(roc_curve)
## 95% CI: 0.6794-0.7292 (DeLong)
plot(roc_curve)
# đánh giá mức độ chính xác: calibration
options(repos = c(CRAN = "https://cloud.r-project.org"))
install.packages("rms", dependencies = TRUE)
##
## The downloaded binary packages are in
## /var/folders/f3/6ztcxq7x3vg1by61h3pk_5yr0000gn/T//RtmpyXBxqz/downloaded_packages
library(rms)
## Warning: package 'rms' was built under R version 4.4.1
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 4.4.1
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:lessR':
##
## label, Merge
## The following objects are masked from 'package:base':
##
## format.pval, units
f = lrm(fx ~ fnbmd.sd + prior.fx, data = df_bma)
pred.logit = predict (f)
phat = 1/(1+exp(-pred.logit))
val.prob(phat, df_bma$fx, m = 20, cex = .5)
## Dxy C (ROC) R2 D D:Chi-sq
## 4.085621e-01 7.042810e-01 1.312413e-01 9.255161e-02 1.973020e+02
## D:p U U:Chi-sq U:p Q
## 0.000000e+00 -9.429514e-04 -2.273737e-12 1.000000e+00 9.349456e-02
## Brier Intercept Slope Emax E90
## 1.716198e-01 -1.126340e-14 1.000000e+00 2.512248e-01 3.359625e-02
## Eavg S:z S:p
## 1.689702e-02 1.321099e-01 8.948974e-01
library(randomForest) # random forest là 1 thuật toán của machine learning
## Warning: package 'randomForest' was built under R version 4.4.1
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
df_bma$fx1 = as.factor(df_bma$fx)
m.rf = randomForest(fx1 ~ age + sex + weight + height + fnbmd + smoking + prior.fx, data = df_bma, importance = TRUE, ntrees = 1000) # chạy mô hình này xong sau đó bỏ từng biến số đó ra mà mưcs độ chính xác của mô hình đó giảm đáng kể thì biến số đó quan trọng
importance = importance(m.rf, type = 1)
importance
## MeanDecreaseAccuracy
## age 10.2549325
## sex 19.5957033
## weight 12.0660072
## height 22.4050856
## fnbmd 27.9474592
## smoking 0.2148956
## prior.fx 16.1710213
varImpPlot(m.rf, type = 1, main = "relative importance")
Collins GS. BMJ 2024