Ngày 3: Hồi qui logistic

Việc 1. Đọc dữ liệu ‘Bone data’

os = read.csv("C:\\Thach\\VN trips\\2024_4Dec\\TDT Uni\\Datasets\\Bone data.csv")
head(os)

##   id    sex age weight height prior.fx fnbmd smoking fx
## 1  1   Male  73     98    175        0  1.08       1  0
## 2  2 Female  68     72    166        0  0.97       0  0
## 3  3   Male  68     87    184        0  1.01       0  0
## 4  4 Female  62     72    173        0  0.84       1  0
## 5  5   Male  61     72    173        0  0.81       1  0
## 6  6 Female  76     57    156        0  0.74       0  0

dim(os)

## [1] 2162    9

head(os)

##   id    sex age weight height prior.fx fnbmd smoking fx
## 1  1   Male  73     98    175        0  1.08       1  0
## 2  2 Female  68     72    166        0  0.97       0  0
## 3  3   Male  68     87    184        0  1.01       0  0
## 4  4 Female  62     72    173        0  0.84       1  0
## 5  5   Male  61     72    173        0  0.81       1  0
## 6  6 Female  76     57    156        0  0.74       0  0

Việc 2. Đánh giá mối liên quan giữa hút thuốc và nguy cơ gãy xương

m.1 = glm(fx ~ smoking, family = binomial, data = os)
summary(m.1)

## 
## Call:
## glm(formula = fx ~ smoking, family = binomial, data = os)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.03006    0.06442 -15.990   <2e-16 ***
## smoking     -0.13796    0.10081  -1.368    0.171    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2441.4  on 2161  degrees of freedom
## Residual deviance: 2439.5  on 2160  degrees of freedom
## AIC: 2443.5
## 
## Number of Fisher Scoring iterations: 4

library(epiDisplay)

## Warning: package 'epiDisplay' was built under R version 4.3.2

## Loading required package: foreign

## Loading required package: survival

## Loading required package: MASS

## Loading required package: nnet

logistic.display(m.1)

## 
## Logistic regression predicting fx 
##  
##                  OR(95%CI)         P(Wald's test) P(LR-test)
## smoking: 1 vs 0  0.87 (0.71,1.06)  0.171          0.17      
##                                                             
## Log-likelihood = -1219.7469
## No. of observations = 2162
## AIC value = 2443.4938

Việc 3. Đánh giá mối liên quan độc lập giữa hút thuốc và nguy cơ gãy xương

Giới tính là yếu tố gây nhiễu

library(compareGroups)
createTable(compareGroups(sex ~ smoking + fx, data = os))

## 
## --------Summary descriptives table by 'sex'---------
## 
## _________________________________________ 
##           Female       Male     p.overall 
##           N=1317       N=845              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## smoking 0.30 (0.46) 0.62 (0.49)  <0.001   
## fx      0.30 (0.46) 0.17 (0.38)  <0.001   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

Hiệu chỉnh nhiễu

os$gender = ifelse(os$sex == "Male", 1, 0)
m.2 = glm(fx ~ smoking + gender, family = binomial, data = os)
logistic.display(m.2)

## 
## Logistic regression predicting fx 
##  
##                  crude OR(95%CI)   adj. OR(95%CI)    P(Wald's test) P(LR-test)
## smoking: 1 vs 0  0.87 (0.71,1.06)  1.1 (0.89,1.36)   0.357          0.358     
##                                                                               
## gender: 1 vs 0   0.47 (0.38,0.58)  0.45 (0.36,0.57)  < 0.001        < 0.001   
##                                                                               
## Log-likelihood = -1194.7997
## No. of observations = 2162
## AIC value = 2395.5994

Việc 4. Đánh giá tầm quan trọng của biến số

library(caret)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:epiDisplay':
## 
##     alpha

## Loading required package: lattice

## 
## Attaching package: 'lattice'

## The following object is masked from 'package:epiDisplay':
## 
##     dotplot

## 
## Attaching package: 'caret'

## The following object is masked from 'package:survival':
## 
##     cluster

m.rel = glm(fx ~ gender + age + fnbmd + weight, family = binomial, data = os)
varImp(m.rel, scale = FALSE)

##          Overall
## gender 2.3022178
## age    2.5112611
## fnbmd  8.7193666
## weight 0.8239425

Việc 5. Xây dựng mô hình dự báo nguy cơ gãy xương

5.1 Tìm mô hình tối ưu bằng pp BMA

library(BMA)

## Loading required package: leaps

## Loading required package: robustbase

## 
## Attaching package: 'robustbase'

## The following object is masked from 'package:survival':
## 
##     heart

## Loading required package: inline

## Loading required package: rrcov

## Scalable Robust Estimators with High Breakdown Point (version 1.7-4)

os.2 = na.omit(os)
xvars = os.2[, c("sex", "age", "weight", "height", "fnbmd", "smoking")]
yvar = os.2[, c("fx")]

m.bma = bic.glm(xvars, yvar, strict = FALSE, OR = 20, glm.family = "binomial")
summary(m.bma)

## 
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = "binomial",     strict = FALSE, OR = 20)
## 
## 
##   4  models were selected
##  Best  4  models (cumulative posterior probability =  1 ): 
## 
##            p!=0    EV        SD        model 1     model 2     model 3   
## Intercept  100     2.406528  0.791302   2.780e+00   1.271e+00   2.633e+00
## sex         13.8  -0.033579  0.095516       .           .      -2.344e-01
## age         23.0   0.004096  0.008381       .       1.753e-02       .    
## weight       0.0   0.000000  0.000000       .           .           .    
## height       0.0   0.000000  0.000000       .           .           .    
## fnbmd      100.0  -4.700740  0.436561  -4.818e+00  -4.489e+00  -4.536e+00
## smoking      0.0   0.000000  0.000000       .           .           .    
##                                                                          
## nVar                                      1           2           2      
## BIC                                    -1.402e+04  -1.401e+04  -1.401e+04
## post prob                               0.675       0.187       0.095    
##            model 4   
## Intercept   9.605e-01
## sex        -2.640e-01
## age         1.919e-02
## weight          .    
## height          .    
## fnbmd      -4.137e+00
## smoking         .    
##                      
## nVar          3      
## BIC        -1.401e+04
## post prob   0.042

imageplot.bma(m.bma)

5.2 Mô hình tối ưu

os.2$fnbmd.sd = os.2$fnbmd/0.15
m.3 = glm(fx ~ fnbmd.sd, family = binomial, data = os.2)
logistic.display(m.3)

## 
## Logistic regression predicting fx 
##  
##                       OR(95%CI)         P(Wald's test) P(LR-test)
## fnbmd.sd (cont. var.) 0.49 (0.43,0.54)  < 0.001        < 0.001   
##                                                                  
## Log-likelihood = -1106.8757
## No. of observations = 2121
## AIC value = 2217.7514

Việc 6. Dữ liệu “Pima Indian Diabetes”

library(readxl)
df = read_excel("C:\\Thach\\VN trips\\2024_4Dec\\TDT Uni\\Datasets\\Pima Indian Diabetes Dta.xlsx")
dim(df)

## [1] 768  10

head(df)

## # A tibble: 6 × 10
##      id Pregnancies Glucose BloodPressure SkinThickness Insulin   bmi
##   <dbl>       <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
## 1     1           6     148            72            35       0  33.6
## 2     2           1      85            66            29       0  26.6
## 3     3           8     183            64             0       0  23.3
## 4     4           1      89            66            23      94  28.1
## 5     5           0     137            40            35     168  43.1
## 6     6           5     116            74             0       0  25.6
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>

df$y = ifelse(df$Outcome == 1, "Yes", "No")

Việc 7. Đánh giá mô hình bằng pp tách dữ liệu

7.1 Tách dữ liệu

library(caret)
set.seed(1234)

index = createDataPartition(df$y, p =.7, list = FALSE)
training = df[index, ]
  dim(training)

## [1] 538  11

testing = df[-index, ]
  dim(testing)

## [1] 230  11

7.2 Huấn luyện mô hình

fit = train(form = y ~ Age + Pregnancies + Glucose + BloodPressure + bmi, data = training, method = "glm", family = "binomial")
summary(fit)

## 
## Call:
## NULL
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -8.084008   0.824117  -9.809  < 2e-16 ***
## Age            0.015382   0.010763   1.429   0.1530    
## Pregnancies    0.094875   0.037380   2.538   0.0111 *  
## Glucose        0.032732   0.004010   8.162 3.30e-16 ***
## BloodPressure -0.007063   0.006114  -1.155   0.2480    
## bmi            0.090121   0.017343   5.196 2.03e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 696.28  on 537  degrees of freedom
## Residual deviance: 528.11  on 532  degrees of freedom
## AIC: 540.11
## 
## Number of Fisher Scoring iterations: 5

7.3 Đánh giá mô hình

testing$y.2 = factor(testing$y, levels = c("Yes", "No"))
pred = predict(fit, newdata = testing, type = "raw")
pred.2 = factor(pred, levels = c("Yes", "No"))
confusionMatrix(testing$y.2, pred.2)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Yes  No
##        Yes  49  31
##        No   18 132
##                                          
##                Accuracy : 0.787          
##                  95% CI : (0.7283, 0.838)
##     No Information Rate : 0.7087         
##     P-Value [Acc > NIR] : 0.004594       
##                                          
##                   Kappa : 0.5119         
##                                          
##  Mcnemar's Test P-Value : 0.086476       
##                                          
##             Sensitivity : 0.7313         
##             Specificity : 0.8098         
##          Pos Pred Value : 0.6125         
##          Neg Pred Value : 0.8800         
##              Prevalence : 0.2913         
##          Detection Rate : 0.2130         
##    Detection Prevalence : 0.3478         
##       Balanced Accuracy : 0.7706         
##                                          
##        'Positive' Class : Yes            
##

7.4 Phân tích ROC

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following object is masked from 'package:epiDisplay':
## 
##     ci

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

prob = predict(fit, newdata = testing, type = "prob")
pred = data.frame(prob, testing$Outcome)
head(pred)

##          No        Yes testing.Outcome
## 1 0.9426214 0.05737863               0
## 2 0.8273269 0.17267313               0
## 3 0.1647211 0.83527888               1
## 4 0.9560975 0.04390247               1
## 5 0.7117197 0.28828032               0
## 6 0.3953255 0.60467451               1

auc = roc(pred$testing.Outcome, pred$Yes)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

auc

## 
## Call:
## roc.default(response = pred$testing.Outcome, predictor = pred$Yes)
## 
## Data: pred$Yes in 150 controls (pred$testing.Outcome 0) < 80 cases (pred$testing.Outcome 1).
## Area under the curve: 0.8517

plot(auc)

Việc 8. Đánh giá mô hình bằng pp k-fold cross-validation

fit.cv = train(form = y ~ Age + Pregnancies + Glucose + BloodPressure + bmi, data = df, method = "glm", family = "binomial", trControl = trainControl(method = "cv", number = 10, summaryFunction = twoClassSummary, classProbs = TRUE))

## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
## in the result set. ROC will be used instead.

fit.cv

## Generalized Linear Model 
## 
## 768 samples
##   5 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 691, 692, 691, 691, 691, 692, ... 
## Resampling results:
## 
##   ROC        Sens   Spec    
##   0.8215698  0.874  0.574359

Việc 9. Đánh giá mô hình bằng pp bootstrap

fit.b = train(form = y ~ Age + Pregnancies + Glucose + BloodPressure + bmi, data = df, method = "glm", family = "binomial", trControl = trainControl(method = "boot", number = 100, summaryFunction = twoClassSummary, classProbs = TRUE))

## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
## in the result set. ROC will be used instead.

fit.b

## Generalized Linear Model 
## 
## 768 samples
##   5 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Bootstrapped (100 reps) 
## Summary of sample sizes: 768, 768, 768, 768, 768, 768, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.8254318  0.8732031  0.5683258

Xây dựng Mô hình Tiên lượng dùng Machine Learning - ĐH Tôn Đức Thắng (11-13/1/2025)

Thach Tran

2024-1-9

Ngày 3: Hồi qui logistic

Việc 1. Đọc dữ liệu ‘Bone data’

Việc 2. Đánh giá mối liên quan giữa hút thuốc và nguy cơ gãy xương

Việc 3. Đánh giá mối liên quan độc lập giữa hút thuốc và nguy cơ gãy xương

Việc 4. Đánh giá tầm quan trọng của biến số

Việc 5. Xây dựng mô hình dự báo nguy cơ gãy xương

5.1 Tìm mô hình tối ưu bằng pp BMA

5.2 Mô hình tối ưu

Việc 6. Dữ liệu “Pima Indian Diabetes”

Việc 7. Đánh giá mô hình bằng pp tách dữ liệu

7.1 Tách dữ liệu

7.2 Huấn luyện mô hình

7.3 Đánh giá mô hình

7.4 Phân tích ROC

Việc 8. Đánh giá mô hình bằng pp k-fold cross-validation

Việc 9. Đánh giá mô hình bằng pp bootstrap

Việc 10. Ghi lại tất cả các hàm/lệnh trên và chia sẻ lên tài khoản rpubs