Chương trình tập huấn phân tích dữ liệu bằng ngôn ngữ R - BV SIS Cần Thơ (2-6/1/2025)

Ngày 5: Hồi qui logistic

Việc 1. Đánh giá liên quan giữa trình độ học vấn và nhận xét về lí do TT Clinton tái đắc cử 1996

1.1 Nhập liệu nhanh

df = matrix(c(91, 104, 235, 39, 73, 48, 18, 31, 161), nrow = 3, byrow = TRUE)

1.2 Thực hiện phép kiểm ki bình phương

chisq.test(df)
## 
##  Pearson's Chi-squared test
## 
## data:  df
## X-squared = 86.023, df = 4, p-value < 2.2e-16

Việc 2. Đánh giá xem nguy cơ tiểu đường có khác nhau giữa nam và nữ không?

2.1 Đọc dữ liệu vào R

df = read.csv("C:\\Thach\\VN trips\\2024_4Dec\\SIS Can Tho\\Datasets\\diabetes data.csv")
head(df)
##   id age    sex height weight systBP diastBP brother parents hypertension   bmi
## 1  1  54 Female    150   68.5    150      80       0       0            1 30.44
## 2  2  62 Female    165   75.5    170      80       0       0            1 27.73
## 3  3  49 Female    152   71.0    130      90       1       1            1 30.73
## 4  4  45 Female    154   58.0    110      90       0       1            1 24.46
## 5  5  54 Female    151   54.0    100      80       1       1            0 23.68
## 6  6  46 Female    156   76.0    120      80       0       1            0 31.23
##    whr nrisks  group
## 1 0.85      3 Normal
## 2 1.00      3    IFG
## 3 0.82      4 Normal
## 4 0.81      3 Normal
## 5 0.80      3 Normal
## 6 0.92      3 Normal

2.2 Tạo biến số mới diab

df$diab = ifelse(df$group == "DM", 1, 0)
head(df)
##   id age    sex height weight systBP diastBP brother parents hypertension   bmi
## 1  1  54 Female    150   68.5    150      80       0       0            1 30.44
## 2  2  62 Female    165   75.5    170      80       0       0            1 27.73
## 3  3  49 Female    152   71.0    130      90       1       1            1 30.73
## 4  4  45 Female    154   58.0    110      90       0       1            1 24.46
## 5  5  54 Female    151   54.0    100      80       1       1            0 23.68
## 6  6  46 Female    156   76.0    120      80       0       1            0 31.23
##    whr nrisks  group diab
## 1 0.85      3 Normal    0
## 2 1.00      3    IFG    0
## 3 0.82      4 Normal    0
## 4 0.81      3 Normal    0
## 5 0.80      3 Normal    0
## 6 0.92      3 Normal    0

2.3 Sử dụng hồi qui logistic

library(lessR)
## Warning: package 'lessR' was built under R version 4.3.3
## 
## lessR 4.3.9                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")   Read text, Excel, SPSS, SAS, or R data file
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()
m.1 = Logit(diab ~ sex, brief = TRUE, data = df)
## 
## >>> Note:  sex is not a numeric variable.
##            Indicator variables are created and analyzed.
## 
## Response Variable:   diab
## Predictor Variable 1:  sexMale
## 
## Number of cases (rows) of data:  3165 
## Number of cases retained for analysis:  3165 
## 
## 
##    BASIC ANALYSIS 
## 
## -- Estimated Model of diab for the Logit of Reference Group Membership
## 
##              Estimate    Std Err  z-value  p-value   Lower 95%   Upper 95%
## (Intercept)   -2.6174     0.0854  -30.637    0.000     -2.7849     -2.4500 
##     sexMale    0.3590     0.1376    2.609    0.009      0.0893      0.6286 
## 
## 
## -- Odds Ratios and Confidence Intervals
## 
##              Odds Ratio   Lower 95%   Upper 95%
## (Intercept)      0.0730      0.0617      0.0863 
##     sexMale      1.4319      1.0935      1.8750 
## 
## 
## -- Model Fit
## 
##     Null deviance: 1709.356 on 3164 degrees of freedom
## Residual deviance: 1702.715 on 3163 degrees of freedom
## 
## AIC: 1706.715 
## 
## Number of iterations to convergence: 5

Việc 3. Đánh giá mối liên quan giữa hút thuốc trong lúc mang thai và tình trạng trẻ sanh thiếu cân

3.1 Đọc dữ liệu vào R

bw = read.csv("C:\\Thach\\VN trips\\2024_4Dec\\SIS Can Tho\\Datasets\\birthwt.csv")
head(bw)
##   id low age lwt race smoke ptl ht ui ftv  bwt
## 1 85   0  19 182    2     0   0  0  1   0 2523
## 2 86   0  33 155    3     0   0  0  0   3 2551
## 3 87   0  20 105    1     1   0  0  0   1 2557
## 4 88   0  21 108    1     1   0  0  1   2 2594
## 5 89   0  18 107    1     1   0  0  1   0 2600
## 6 91   0  21 124    3     0   0  0  0   0 2622

3.2 Đánh giá mới liên quan giữa hút thuốc trong lúc mang thai và tình trạng trẻ sanh thiếu cân

library(lessR)
Logit(low ~ factor(smoke), data = bw, brief = TRUE)
## 
## Response Variable:   low
## Predictor Variable 1:  smoke
## 
## Number of cases (rows) of data:  189 
## Number of cases retained for analysis:  189 
## 
## 
##    BASIC ANALYSIS 
## 
## -- Estimated Model of low for the Logit of Reference Group Membership
## 
##                 Estimate    Std Err  z-value  p-value   Lower 95%   Upper 95%
##    (Intercept)   -1.0871     0.2147   -5.062    0.000     -1.5079     -0.6662 
## factor(smoke)1    0.7041     0.3196    2.203    0.028      0.0776      1.3305 
## 
## 
## -- Odds Ratios and Confidence Intervals
## 
##                 Odds Ratio   Lower 95%   Upper 95%
##    (Intercept)      0.3372      0.2214      0.5137 
## factor(smoke)1      2.0219      1.0807      3.7831 
## 
## 
## -- Model Fit
## 
##     Null deviance: 234.672 on 188 degrees of freedom
## Residual deviance: 229.805 on 187 degrees of freedom
## 
## AIC: 233.8046 
## 
## Number of iterations to convergence: 4

3.3 Đánh giá mới liên quan ĐỘC LẬP giữa hút thuốc trong lúc mang thai và tình trạng trẻ sanh thiếu cân

Logit(low ~ factor(smoke) + age + factor(race), data = bw, brief = TRUE)
## 
## Response Variable:   low
## Predictor Variable 1:  smoke
## Predictor Variable 2:  age
## Predictor Variable 3:  race
## 
## Number of cases (rows) of data:  189 
## Number of cases retained for analysis:  189 
## 
## 
##    BASIC ANALYSIS 
## 
## -- Estimated Model of low for the Logit of Reference Group Membership
## 
##                 Estimate    Std Err  z-value  p-value   Lower 95%   Upper 95%
##    (Intercept)   -1.0076     0.8617   -1.169    0.242     -2.6964      0.6813 
## factor(smoke)1    1.1006     0.3719    2.959    0.003      0.3716      1.8295 
##            age   -0.0349     0.0334   -1.044    0.296     -0.1004      0.0306 
##  factor(race)2    1.0114     0.4934    2.050    0.040      0.0443      1.9785 
##  factor(race)3    1.0567     0.4060    2.603    0.009      0.2611      1.8524 
## 
## 
## -- Odds Ratios and Confidence Intervals
## 
##                 Odds Ratio   Lower 95%   Upper 95%
##    (Intercept)      0.3651      0.0674      1.9764 
## factor(smoke)1      3.0058      1.4500      6.2311 
##            age      0.9657      0.9045      1.0311 
##  factor(race)2      2.7495      1.0453      7.2319 
##  factor(race)3      2.8769      1.2983      6.3751 
## 
## 
## -- Model Fit
## 
##     Null deviance: 234.672 on 188 degrees of freedom
## Residual deviance: 218.862 on 184 degrees of freedom
## 
## AIC: 228.8622 
## 
## Number of iterations to convergence: 4 
## 
## 
## Collinearity
## 
##       Tolerance       VIF
## smoke     0.874     1.144
## age       0.945     1.058
## race      0.123     8.124

Việc 4. Xây dựng mô hình dự báo trẻ sanh thiếu cân

4.1 Phương pháp stepwise

Chọn tập tin mới loại bỏ ID và weight

bw1 = bw[, c("low", "age", "lwt", "race", "smoke", "ptl", "ht", "ui", "ftv")]
head(bw1)
##   low age lwt race smoke ptl ht ui ftv
## 1   0  19 182    2     0   0  0  1   0
## 2   0  33 155    3     0   0  0  0   3
## 3   0  20 105    1     1   0  0  0   1
## 4   0  21 108    1     1   0  0  1   2
## 5   0  18 107    1     1   0  0  1   0
## 6   0  21 124    3     0   0  0  0   0

Phương pháp stepwise

m.0 = glm(low ~ ., family = binomial, data = bw1)
step(m.0)
## Start:  AIC=222.19
## low ~ age + lwt + race + smoke + ptl + ht + ui + ftv
## 
##         Df Deviance    AIC
## - ftv    1   204.33 220.33
## - age    1   205.18 221.18
## <none>       204.19 222.19
## - ui     1   206.58 222.58
## - ptl    1   206.72 222.72
## - lwt    1   208.03 224.03
## - race   1   208.75 224.75
## - smoke  1   209.89 225.89
## - ht     1   211.52 227.52
## 
## Step:  AIC=220.33
## low ~ age + lwt + race + smoke + ptl + ht + ui
## 
##         Df Deviance    AIC
## - age    1   205.21 219.21
## <none>       204.33 220.33
## - ui     1   206.67 220.67
## - ptl    1   206.83 220.83
## - lwt    1   208.04 222.04
## - race   1   208.77 222.77
## - smoke  1   209.91 223.91
## - ht     1   211.53 225.53
## 
## Step:  AIC=219.21
## low ~ lwt + race + smoke + ptl + ht + ui
## 
##         Df Deviance    AIC
## <none>       205.21 219.21
## - ptl    1   207.34 219.34
## - ui     1   207.81 219.81
## - lwt    1   209.58 221.58
## - race   1   210.31 222.31
## - smoke  1   211.17 223.17
## - ht     1   212.61 224.61
## 
## Call:  glm(formula = low ~ lwt + race + smoke + ptl + ht + ui, family = binomial, 
##     data = bw1)
## 
## Coefficients:
## (Intercept)          lwt         race        smoke          ptl           ht  
##    -0.80364     -0.01295      0.46913      0.94817      0.49145      1.83316  
##          ui  
##     0.74794  
## 
## Degrees of Freedom: 188 Total (i.e. Null);  182 Residual
## Null Deviance:       234.7 
## Residual Deviance: 205.2     AIC: 219.2

4.2 Phương pháp BMA

library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-4)
xvars = bw1[, -1]
yvar = bw1[, 1]
m.bma = bic.glm(xvars, yvar, strict = FALSE, OR = 20, glm.family = binomial)
summary(m.bma)
## 
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = binomial,     strict = FALSE, OR = 20)
## 
## 
##   84  models were selected
##  Best  5  models (cumulative posterior probability =  0.2531 ): 
## 
##            p!=0   EV        SD        model 1     model 2     model 3   
## Intercept  100   -0.390128  1.575728     1.45068     1.09291    -2.32488
## age        10.4  -0.004815  0.018070       .           .           .    
## lwt        54.8  -0.008473  0.009253    -0.01865    -0.01707       .    
## race       44.3   0.212462  0.280368       .           .         0.55898
## smoke      52.1   0.484523  0.552668       .           .         1.11668
## ptl        41.2   0.291512  0.410590       .         0.72560       .    
## ht         59.7   1.011382  0.999519     1.85551     1.85604       .    
## ui         30.0   0.263111  0.470489       .           .           .    
## ftv         2.0  -0.001015  0.024588       .           .           .    
##                                                                         
## nVar                                       2           3           2    
## BIC                                   -753.82285  -753.75940  -753.62525
## post prob                                0.058       0.056       0.052  
##            model 4     model 5   
## Intercept    -0.35754     1.06795
## age             .           .    
## lwt          -0.01535    -0.01692
## race          0.48955       .    
## smoke         1.08002       .    
## ptl             .           .    
## ht            1.74427     1.96157
## ui              .         0.93000
## ftv             .           .    
##                                  
## nVar            4           3    
## BIC        -753.44086  -753.11035
## post prob     0.048       0.040
imageplot.bma(m.bma)

Việc 5. Ghi lại tất cả các hàm/lệnh trên và chia sẻ lên tài khoản rpubs