Ngày 3: Hồi qui logistic

Việc 1. Đọc dữ liệu ‘Bone data’

os = read.csv("C:\\Thach\\VN trips\\2024_4Dec\\TDT Uni\\Datasets\\Bone data.csv")
head(os)
##   id    sex age weight height prior.fx fnbmd smoking fx
## 1  1   Male  73     98    175        0  1.08       1  0
## 2  2 Female  68     72    166        0  0.97       0  0
## 3  3   Male  68     87    184        0  1.01       0  0
## 4  4 Female  62     72    173        0  0.84       1  0
## 5  5   Male  61     72    173        0  0.81       1  0
## 6  6 Female  76     57    156        0  0.74       0  0
dim(os)
## [1] 2162    9
head(os)
##   id    sex age weight height prior.fx fnbmd smoking fx
## 1  1   Male  73     98    175        0  1.08       1  0
## 2  2 Female  68     72    166        0  0.97       0  0
## 3  3   Male  68     87    184        0  1.01       0  0
## 4  4 Female  62     72    173        0  0.84       1  0
## 5  5   Male  61     72    173        0  0.81       1  0
## 6  6 Female  76     57    156        0  0.74       0  0

Việc 2. Đánh giá mối liên quan giữa hút thuốc và nguy cơ gãy xương

m.1 = glm(fx ~ smoking, family = binomial, data = os)
summary(m.1)
## 
## Call:
## glm(formula = fx ~ smoking, family = binomial, data = os)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.03006    0.06442 -15.990   <2e-16 ***
## smoking     -0.13796    0.10081  -1.368    0.171    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2441.4  on 2161  degrees of freedom
## Residual deviance: 2439.5  on 2160  degrees of freedom
## AIC: 2443.5
## 
## Number of Fisher Scoring iterations: 4
library(epiDisplay)
## Warning: package 'epiDisplay' was built under R version 4.3.2
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
logistic.display(m.1)
## 
## Logistic regression predicting fx 
##  
##                  OR(95%CI)         P(Wald's test) P(LR-test)
## smoking: 1 vs 0  0.87 (0.71,1.06)  0.171          0.17      
##                                                             
## Log-likelihood = -1219.7469
## No. of observations = 2162
## AIC value = 2443.4938

Việc 3. Đánh giá mối liên quan độc lập giữa hút thuốc và nguy cơ gãy xương

Giới tính là yếu tố gây nhiễu

library(compareGroups)
createTable(compareGroups(sex ~ smoking + fx, data = os))
## 
## --------Summary descriptives table by 'sex'---------
## 
## _________________________________________ 
##           Female       Male     p.overall 
##           N=1317       N=845              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## smoking 0.30 (0.46) 0.62 (0.49)  <0.001   
## fx      0.30 (0.46) 0.17 (0.38)  <0.001   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

Hiệu chỉnh nhiễu

os$gender = ifelse(os$sex == "Male", 1, 0)
m.2 = glm(fx ~ smoking + gender, family = binomial, data = os)
logistic.display(m.2)
## 
## Logistic regression predicting fx 
##  
##                  crude OR(95%CI)   adj. OR(95%CI)    P(Wald's test) P(LR-test)
## smoking: 1 vs 0  0.87 (0.71,1.06)  1.1 (0.89,1.36)   0.357          0.358     
##                                                                               
## gender: 1 vs 0   0.47 (0.38,0.58)  0.45 (0.36,0.57)  < 0.001        < 0.001   
##                                                                               
## Log-likelihood = -1194.7997
## No. of observations = 2162
## AIC value = 2395.5994

Việc 4. Đánh giá tầm quan trọng của biến số

library(caret)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:epiDisplay':
## 
##     alpha
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:epiDisplay':
## 
##     dotplot
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
m.rel = glm(fx ~ gender + age + fnbmd + weight, family = binomial, data = os)
varImp(m.rel, scale = FALSE)
##          Overall
## gender 2.3022178
## age    2.5112611
## fnbmd  8.7193666
## weight 0.8239425

Việc 5. Xây dựng mô hình dự báo nguy cơ gãy xương

5.1 Tìm mô hình tối ưu bằng pp BMA

library(BMA)
## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-4)
os.2 = na.omit(os)
xvars = os.2[, c("sex", "age", "weight", "height", "fnbmd", "smoking")]
yvar = os.2[, c("fx")]

m.bma = bic.glm(xvars, yvar, strict = FALSE, OR = 20, glm.family = "binomial")
summary(m.bma)
## 
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = "binomial",     strict = FALSE, OR = 20)
## 
## 
##   4  models were selected
##  Best  4  models (cumulative posterior probability =  1 ): 
## 
##            p!=0    EV        SD        model 1     model 2     model 3   
## Intercept  100     2.406528  0.791302   2.780e+00   1.271e+00   2.633e+00
## sex         13.8  -0.033579  0.095516       .           .      -2.344e-01
## age         23.0   0.004096  0.008381       .       1.753e-02       .    
## weight       0.0   0.000000  0.000000       .           .           .    
## height       0.0   0.000000  0.000000       .           .           .    
## fnbmd      100.0  -4.700740  0.436561  -4.818e+00  -4.489e+00  -4.536e+00
## smoking      0.0   0.000000  0.000000       .           .           .    
##                                                                          
## nVar                                      1           2           2      
## BIC                                    -1.402e+04  -1.401e+04  -1.401e+04
## post prob                               0.675       0.187       0.095    
##            model 4   
## Intercept   9.605e-01
## sex        -2.640e-01
## age         1.919e-02
## weight          .    
## height          .    
## fnbmd      -4.137e+00
## smoking         .    
##                      
## nVar          3      
## BIC        -1.401e+04
## post prob   0.042
imageplot.bma(m.bma)

5.2 Mô hình tối ưu

os.2$fnbmd.sd = os.2$fnbmd/0.15
m.3 = glm(fx ~ fnbmd.sd, family = binomial, data = os.2)
logistic.display(m.3)
## 
## Logistic regression predicting fx 
##  
##                       OR(95%CI)         P(Wald's test) P(LR-test)
## fnbmd.sd (cont. var.) 0.49 (0.43,0.54)  < 0.001        < 0.001   
##                                                                  
## Log-likelihood = -1106.8757
## No. of observations = 2121
## AIC value = 2217.7514

Việc 6. Dữ liệu “Pima Indian Diabetes”

library(readxl)
df = read_excel("C:\\Thach\\VN trips\\2024_4Dec\\TDT Uni\\Datasets\\Pima Indian Diabetes Dta.xlsx")
dim(df)
## [1] 768  10
head(df)
## # A tibble: 6 × 10
##      id Pregnancies Glucose BloodPressure SkinThickness Insulin   bmi
##   <dbl>       <dbl>   <dbl>         <dbl>         <dbl>   <dbl> <dbl>
## 1     1           6     148            72            35       0  33.6
## 2     2           1      85            66            29       0  26.6
## 3     3           8     183            64             0       0  23.3
## 4     4           1      89            66            23      94  28.1
## 5     5           0     137            40            35     168  43.1
## 6     6           5     116            74             0       0  25.6
## # ℹ 3 more variables: DiabetesPedigreeFunction <dbl>, Age <dbl>, Outcome <dbl>
df$y = ifelse(df$Outcome == 1, "Yes", "No")

Việc 7. Đánh giá mô hình bằng pp tách dữ liệu

7.1 Tách dữ liệu

library(caret)
set.seed(1234)

index = createDataPartition(df$y, p =.7, list = FALSE)
training = df[index, ]
  dim(training)
## [1] 538  11
testing = df[-index, ]
  dim(testing)
## [1] 230  11

7.2 Huấn luyện mô hình

fit = train(form = y ~ Age + Pregnancies + Glucose + BloodPressure + bmi, data = training, method = "glm", family = "binomial")
summary(fit)
## 
## Call:
## NULL
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -8.084008   0.824117  -9.809  < 2e-16 ***
## Age            0.015382   0.010763   1.429   0.1530    
## Pregnancies    0.094875   0.037380   2.538   0.0111 *  
## Glucose        0.032732   0.004010   8.162 3.30e-16 ***
## BloodPressure -0.007063   0.006114  -1.155   0.2480    
## bmi            0.090121   0.017343   5.196 2.03e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 696.28  on 537  degrees of freedom
## Residual deviance: 528.11  on 532  degrees of freedom
## AIC: 540.11
## 
## Number of Fisher Scoring iterations: 5

7.3 Đánh giá mô hình

testing$y.2 = factor(testing$y, levels = c("Yes", "No"))
pred = predict(fit, newdata = testing, type = "raw")
pred.2 = factor(pred, levels = c("Yes", "No"))
confusionMatrix(testing$y.2, pred.2)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Yes  No
##        Yes  49  31
##        No   18 132
##                                          
##                Accuracy : 0.787          
##                  95% CI : (0.7283, 0.838)
##     No Information Rate : 0.7087         
##     P-Value [Acc > NIR] : 0.004594       
##                                          
##                   Kappa : 0.5119         
##                                          
##  Mcnemar's Test P-Value : 0.086476       
##                                          
##             Sensitivity : 0.7313         
##             Specificity : 0.8098         
##          Pos Pred Value : 0.6125         
##          Neg Pred Value : 0.8800         
##              Prevalence : 0.2913         
##          Detection Rate : 0.2130         
##    Detection Prevalence : 0.3478         
##       Balanced Accuracy : 0.7706         
##                                          
##        'Positive' Class : Yes            
## 

7.4 Phân tích ROC

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:epiDisplay':
## 
##     ci
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
prob = predict(fit, newdata = testing, type = "prob")
pred = data.frame(prob, testing$Outcome)
head(pred)
##          No        Yes testing.Outcome
## 1 0.9426214 0.05737863               0
## 2 0.8273269 0.17267313               0
## 3 0.1647211 0.83527888               1
## 4 0.9560975 0.04390247               1
## 5 0.7117197 0.28828032               0
## 6 0.3953255 0.60467451               1
auc = roc(pred$testing.Outcome, pred$Yes)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc
## 
## Call:
## roc.default(response = pred$testing.Outcome, predictor = pred$Yes)
## 
## Data: pred$Yes in 150 controls (pred$testing.Outcome 0) < 80 cases (pred$testing.Outcome 1).
## Area under the curve: 0.8517
plot(auc)

Việc 8. Đánh giá mô hình bằng pp k-fold cross-validation

fit.cv = train(form = y ~ Age + Pregnancies + Glucose + BloodPressure + bmi, data = df, method = "glm", family = "binomial", trControl = trainControl(method = "cv", number = 10, summaryFunction = twoClassSummary, classProbs = TRUE))
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
## in the result set. ROC will be used instead.
fit.cv
## Generalized Linear Model 
## 
## 768 samples
##   5 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 691, 692, 691, 691, 691, 692, ... 
## Resampling results:
## 
##   ROC        Sens   Spec    
##   0.8215698  0.874  0.574359

Việc 9. Đánh giá mô hình bằng pp bootstrap

fit.b = train(form = y ~ Age + Pregnancies + Glucose + BloodPressure + bmi, data = df, method = "glm", family = "binomial", trControl = trainControl(method = "boot", number = 100, summaryFunction = twoClassSummary, classProbs = TRUE))
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
## in the result set. ROC will be used instead.
fit.b
## Generalized Linear Model 
## 
## 768 samples
##   5 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Bootstrapped (100 reps) 
## Summary of sample sizes: 768, 768, 768, 768, 768, 768, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.8254318  0.8732031  0.5683258

Việc 10. Ghi lại tất cả các hàm/lệnh trên và chia sẻ lên tài khoản rpubs