Van Lang University Workshop on Machine Learning - Day 3: Logistic regression

Việc 1. Đọc dữ liệu arrest

arr = read.csv("C:\\Thach\\VLU workshop (Jun2023)\\Datasets\\Arrest data.csv", header = TRUE)
dim(arr)
## [1] 432  11
head(arr)
##   id week arrest finance age  race work     married parole prior educ
## 1  1   20      1      no  27 black   no not married    yes     3    3
## 2  2   17      1      no  18 black   no not married    yes     8    4
## 3  3   25      1      no  19 other  yes not married    yes    13    3
## 4  4   52      0     yes  23 black  yes     married    yes     1    5
## 5  5   52      0      no  19 other  yes not married    yes     3    3
## 6  6   52      0      no  24 black  yes not married     no     2    4

Việc 2. Đánh giá đặc điểm biến số theo tình trạng bị bắt lại (arrest)

library(compareGroups)
arr$educ = as.factor(arr$educ)
createTable(compareGroups(arrest ~ age + race + educ + prior + work + married + parole + finance, data = arr))
## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect

## Warning in chisq.test(xx, correct = FALSE): Chi-squared approximation may be
## incorrect
## 
## --------Summary descriptives table by 'arrest'---------
## 
## _________________________________________________ 
##                      0           1      p.overall 
##                    N=318       N=114              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## age             25.3 (6.31) 22.8 (5.12)  <0.001   
## race:                                     0.621   
##     black       277 (87.1%) 102 (89.5%)           
##     other       41 (12.9%)  12 (10.5%)            
## educ:                                     0.023   
##     2           20 (6.29%)   4 (3.51%)            
##     3           162 (50.9%) 77 (67.5%)            
##     4           92 (28.9%)  27 (23.7%)            
##     5           34 (10.7%)   5 (4.39%)            
##     6           10 (3.14%)   1 (0.88%)            
## prior           2.70 (2.55) 3.77 (3.59)   0.004   
## work:                                     0.005   
##     no          123 (38.7%) 62 (54.4%)            
##     yes         195 (61.3%) 52 (45.6%)            
## married:                                  0.068   
##     married     45 (14.2%)   8 (7.02%)            
##     not married 273 (85.8%) 106 (93.0%)           
## parole:                                   0.660   
##     no          119 (37.4%) 46 (40.4%)            
##     yes         199 (62.6%) 68 (59.6%)            
## finance:                                  0.063   
##     no          150 (47.2%) 66 (57.9%)            
##     yes         168 (52.8%) 48 (42.1%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

Việc 3. Đánh giá mối liên quan giữa tuổi (age) và khả năng bị bắt lại (arrest)

3.1 Vẽ biểu đồ hộp thể hiện mối liên quan giữa tuổi và bị bắt lại

library(ggplot2)
ggplot(arr, aes(group = arrest, x = arrest, y = age, fill = arrest, color = arrest)) + geom_boxplot(colour = "black") + geom_jitter(aes(color = arrest)) + theme(legend.position = "none") + labs(x = "Arrest", y = "Age (years)")

3.2 Dùng hồi qui logistic để đánh giá mối liên quan. Diễn giải kết quả

m1 = glm(arrest ~ age, family = binomial, data = arr)
summary(m1)
## 
## Call:
## glm(formula = arrest ~ age, family = binomial, data = arr)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9909  -0.8362  -0.7246   1.4131   2.3509  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.95611    0.54357   1.759 0.078586 .  
## age         -0.08305    0.02296  -3.617 0.000298 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 498.60  on 431  degrees of freedom
## Residual deviance: 482.69  on 430  degrees of freedom
## AIC: 486.69
## 
## Number of Fisher Scoring iterations: 4
library(epiDisplay)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
## 
## Attaching package: 'epiDisplay'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
logistic.display(m1)
## 
## Logistic regression predicting arrest 
##  
##                  OR(95%CI)         P(Wald's test) P(LR-test)
## age (cont. var.) 0.92 (0.88,0.96)  < 0.001        < 0.001   
##                                                             
## Log-likelihood = -241.3436
## No. of observations = 432
## AIC value = 486.6872

3.3 Đánh giá khả năng dự báo của mô hình bằng phương pháp bootstrap

library(rms)
## Loading required package: Hmisc
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Registered S3 method overwritten by 'rms':
##   method       from      
##   print.lrtest epiDisplay
## 
## Attaching package: 'rms'
## The following object is masked from 'package:epiDisplay':
## 
##     lrtest
m1.v = lrm(arrest ~ age, data = arr, x = TRUE, y = TRUE)
validate = validate(m1.v, B = 100)
validate
##           index.orig training   test optimism index.corrected   n
## Dxy           0.2807   0.2906 0.2807   0.0099          0.2708 100
## R2            0.0528   0.0593 0.0528   0.0065          0.0463 100
## Intercept     0.0000   0.0000 0.0254  -0.0254          0.0254 100
## Slope         1.0000   1.0000 1.0117  -0.0117          1.0117 100
## Emax          0.0000   0.0000 0.0074   0.0074          0.0074 100
## D             0.0345   0.0392 0.0345   0.0047          0.0298 100
## U            -0.0046  -0.0046 0.0006  -0.0052          0.0006 100
## Q             0.0392   0.0439 0.0339   0.0099          0.0292 100
## B             0.1869   0.1847 0.1877  -0.0029          0.1899 100
## g             0.5278   0.5626 0.5278   0.0348          0.4930 100
## gp            0.0890   0.0923 0.0890   0.0034          0.0856 100

Dxy = 2(AUC - 0.5)

Emax = maximum difference between raw predicted probabilities and recalibrated probabilities

D (Discrimination index) = the difference in quality between the best constant predictor (one that on average predicts the overall prevalence of an event) and the best calibrated predictor.

U (Unreliable index) = unitless index of how far the logit calibration curve intercept and slope are from (0, 1)

Q (Overall quality = D - U) = The difference between the best constant predictor and the quality of predictions as they stand (no calibration).

B (Brier index) = The mean square error between predictions and observations.

g (Gini’s mean difference calculated on the log-OR scale) = model’s predictive discrimination performance based only on the fitted values.

gp (Gini’s mean difference calculated on the probability scale)

Việc 4. Đánh giá mối liên quan giữa hỗ trợ tài chánh (finance) và khả năng bị bắt lại (arrest)

4.1 Dùng mô hình hồi qui logistic đánh giá mối liên quan giữa hỗ trợ tài chánh và bị bắt lại

m2 = glm(arrest ~ finance, family = binomial, data = arr)
summary(m2)
## 
## Call:
## glm(formula = arrest ~ finance, family = binomial, data = arr)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.854  -0.854  -0.709   1.540   1.734  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.8210     0.1477  -5.558 2.73e-08 ***
## financeyes   -0.4318     0.2205  -1.959   0.0502 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 498.60  on 431  degrees of freedom
## Residual deviance: 494.73  on 430  degrees of freedom
## AIC: 498.73
## 
## Number of Fisher Scoring iterations: 4

4.2 So sánh khác biệt giữa tuổi và hỗ trợ tài chánh

ggplot(arr, aes(group = finance, x = finance, y = age, fill = finance, color = finance)) + geom_boxplot(colour = "black") + geom_jitter(aes(color = arrest)) + theme(legend.position = "none") + labs(x = "Finance", y = "Age (years)")

Người được hỗ trợ tài chánh có khuynh hướng lớn tuổi hơn so với người không nhận được hỗ trợ tài chánh.

4.3 Đánh giá ảnh hưởng của hỗ trợ tài chánh (finance) lên khả năng bị bắt lại sau khi hiệu chỉnh cho khác biệt về độ tuổi

m3 = glm(arrest ~ finance + age, family = binomial, data = arr)
summary(m3)
## 
## Call:
## glm(formula = arrest ~ finance + age, family = binomial, data = arr)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0633  -0.8193  -0.7107   1.3320   2.2949  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.11913    0.55462   2.018 0.043608 *  
## financeyes  -0.39905    0.22408  -1.781 0.074935 .  
## age         -0.08197    0.02309  -3.550 0.000385 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 498.60  on 431  degrees of freedom
## Residual deviance: 479.49  on 429  degrees of freedom
## AIC: 485.49
## 
## Number of Fisher Scoring iterations: 4

4.4 Đánh giá khả năng dự báo của mô hình bằng phương pháp bootstrap

library(rms)
m3.v = lrm(arrest ~ finance + age, data = arr, x = TRUE, y = TRUE)
validate = validate(m3.v, B = 100)
validate
##           index.orig training    test optimism index.corrected   n
## Dxy           0.2753   0.2863  0.2677   0.0185          0.2568 100
## R2            0.0632   0.0707  0.0599   0.0108          0.0524 100
## Intercept     0.0000   0.0000 -0.0028   0.0028         -0.0028 100
## Slope         1.0000   1.0000  0.9998   0.0002          0.9998 100
## Emax          0.0000   0.0000  0.0007   0.0007          0.0007 100
## D             0.0419   0.0476  0.0396   0.0080          0.0340 100
## U            -0.0046  -0.0046  0.0006  -0.0052          0.0006 100
## Q             0.0466   0.0522  0.0390   0.0132          0.0334 100
## B             0.1860   0.1850  0.1873  -0.0022          0.1882 100
## g             0.5899   0.6196  0.5693   0.0503          0.5396 100
## gp            0.1007   0.1045  0.0982   0.0062          0.0944 100

Việc 5. Đánh giá mô hình

5.1 Thực hiện mô hình

m4 = glm(arrest ~ age + finance + race + work + married + parole + educ, family = binomial, data = arr)
summary(m4)
## 
## Call:
## glm(formula = arrest ~ age + finance + race + work + married + 
##     parole + educ, family = binomial, data = arr)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1947  -0.8215  -0.6322   1.1923   2.4775  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)  
## (Intercept)         0.03062    0.99560   0.031   0.9755  
## age                -0.06042    0.02483  -2.433   0.0150 *
## financeyes         -0.46681    0.23074  -2.023   0.0431 *
## raceother          -0.39425    0.36908  -1.068   0.2854  
## workyes            -0.21147    0.24787  -0.853   0.3936  
## marriednot married  0.44496    0.42376   1.050   0.2937  
## paroleyes          -0.15446    0.23449  -0.659   0.5101  
## educ3               0.71294    0.59084   1.207   0.2276  
## educ4               0.23309    0.61778   0.377   0.7060  
## educ5              -0.24754    0.74909  -0.330   0.7411  
## educ6              -0.49929    1.20272  -0.415   0.6780  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 498.60  on 431  degrees of freedom
## Residual deviance: 466.61  on 421  degrees of freedom
## AIC: 488.61
## 
## Number of Fisher Scoring iterations: 4

5.2 Đánh giá khả năng dự báo của mô hình bằng phương pháp bootstrap

library(rms)
m4.v = lrm(arrest ~ age + finance + race + work + married + parole + educ, data = arr, x = TRUE, y = TRUE)
validate = validate(m4.v, B = 100)
validate
##           index.orig training    test optimism index.corrected   n
## Dxy           0.3721   0.4011  0.3281   0.0730          0.2991 100
## R2            0.1043   0.1358  0.0766   0.0593          0.0450 100
## Intercept     0.0000   0.0000 -0.2725   0.2725         -0.2725 100
## Slope         1.0000   1.0000  0.7034   0.2966          0.7034 100
## Emax          0.0000   0.0000  0.1314   0.1314          0.1314 100
## D             0.0718   0.0955  0.0516   0.0439          0.0279 100
## U            -0.0046  -0.0046  0.0118  -0.0165          0.0118 100
## Q             0.0764   0.1001  0.0397   0.0604          0.0160 100
## B             0.1795   0.1751  0.1835  -0.0084          0.1879 100
## g             0.7887   1.0213  0.6759   0.3455          0.4432 100
## gp            0.1341   0.1496  0.1107   0.0388          0.0953 100

5.3 Dùng package ‘caret’

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:epiDisplay':
## 
##     dotplot
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
sample = createDataPartition(arr$arrest, p = 0.6, list = FALSE)
train = arr[sample,]
test = arr[-sample,]

m5 = train(factor(arrest) ~ age + finance + race + work + married + parole + educ, data = train, method = "glm", family = "binomial")
summary(m5)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3047  -0.7942  -0.5433  -0.1653   2.5060  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)   
## (Intercept)          -0.38353    1.54624  -0.248  0.80410   
## age                  -0.09677    0.03538  -2.735  0.00623 **
## financeyes           -0.37128    0.31091  -1.194  0.23241   
## raceother            -0.47629    0.48545  -0.981  0.32653   
## workyes              -0.37817    0.33718  -1.122  0.26204   
## `marriednot married`  0.75207    0.65967   1.140  0.25426   
## paroleyes            -0.32942    0.31538  -1.045  0.29625   
## educ3                 1.76447    1.08937   1.620  0.10529   
## educ4                 1.32948    1.11647   1.191  0.23374   
## educ5                 1.00789    1.33236   0.756  0.44937   
## educ6                 0.88281    1.53809   0.574  0.56599   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 290.20  on 259  degrees of freedom
## Residual deviance: 259.65  on 249  degrees of freedom
## AIC: 281.65
## 
## Number of Fisher Scoring iterations: 5
arrest.pred = predict(m5, newdata = test, type = "raw")
confusionMatrix(data = arrest.pred, reference = factor(test$arrest), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 118  46
##          1   4   4
##                                           
##                Accuracy : 0.7093          
##                  95% CI : (0.6353, 0.7759)
##     No Information Rate : 0.7093          
##     P-Value [Acc > NIR] : 0.5381          
##                                           
##                   Kappa : 0.0628          
##                                           
##  Mcnemar's Test P-Value : 6.7e-09         
##                                           
##             Sensitivity : 0.08000         
##             Specificity : 0.96721         
##          Pos Pred Value : 0.50000         
##          Neg Pred Value : 0.71951         
##              Prevalence : 0.29070         
##          Detection Rate : 0.02326         
##    Detection Prevalence : 0.04651         
##       Balanced Accuracy : 0.52361         
##                                           
##        'Positive' Class : 1               
## 

Test/Disease A B / C D Sen = A / (A + C) Spe = D / (B + D) PPV = A / (A + B) NPV = D / (C + D) Detection rate = A / N Detection Prevalence = (A + B)/ N Balanced Accuracy = (Sen + Spe) / 2

Việc 7. Tính AUC cho mô hình

prob = predict(m5, newdata = test, type = "prob")
pred = data.frame(prob, test$arrest)
head(pred)
##           X0         X1 test.arrest
## 3  0.7087968 0.29120316           1
## 4  0.9358528 0.06414719           0
## 7  0.8985259 0.10147412           1
## 8  0.7267284 0.27327155           0
## 9  0.4989971 0.50100288           0
## 10 0.7185329 0.28146712           0
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:epiDisplay':
## 
##     ci
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
roc(pred$test.arrest, pred$X1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## 
## Call:
## roc.default(response = pred$test.arrest, predictor = pred$X1)
## 
## Data: pred$X1 in 122 controls (pred$test.arrest 0) < 50 cases (pred$test.arrest 1).
## Area under the curve: 0.6143
plot(smooth(roc(pred$test.arrest, pred$X1)))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

Việc 8. Bạn hãy ghi lại tất cả những hàm/lệnh trên trong RMarkdown và chia sẻ trên mạng rpubs.com/tài khoản của bạn (https://rpubs.com/ThachTran/1052255).