Data Analysis Workshop - Phuong Nam Institute (6-11/1/2026)

Phân tích Propensity Score

library(compareGroups)
library(MatchIt)

## Warning: package 'MatchIt' was built under R version 4.3.3

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

Việc 1. Đọc dữ liệu vào R

ps = read.csv("C:\\Thach\\VN trips\\2026_1Jan\\Data Analysis Workshop_PN Institute\\Data\\ps_data.csv")
head(ps)

##   id  age sex weight height smoking activity heart hypert diabetes cancer lung
## 1  1 57.0   1   99.0  165.0       0        0     0      1        0      1    1
## 2  2 57.0   1   80.9  160.0       1        1     0      1        0      1    1
## 3  3 76.5   0   75.7  175.9       0        0     0      1        0      1    0
## 4  4 72.0   1   59.0  151.0       0        1     0      0        0      0    0
## 5  5 85.5   0   55.5  157.0       0        0     0      0        0      1    0
## 6  6 48.0   1   94.0  158.5       0        0     0      1        0      0    0
##   treat death
## 1     1     0
## 2     1     0
## 3     0     0
## 4     1     0
## 5     0     1
## 6     1     0

Việc 2. Mô tả đặc điểm cơ bản

ps = ps %>% mutate(sex = as.character(sex),
                   smoking = as.character(smoking),
                   activity = as.character(activity),
                   heart = as.character(heart),
                   hypert = as.character(hypert),
                   diabetes = as.character(diabetes),
                   cancer = as.character(cancer),
                   lung = as.character(lung),
                   death.c = as.character(death))
                   
des.1 = compareGroups(treat ~ age + sex + weight + height + smoking + activity + heart + hypert + diabetes + cancer + lung, data = ps)
createTable(des.1)

## 
## --------Summary descriptives table by 'treat'---------
## 
## ___________________________________________ 
##                0           1      p.overall 
##              N=706       N=294              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## age       67.7 (9.33) 65.8 (9.05)   0.002   
## sex:                                0.002   
##     0     192 (27.2%) 52 (17.7%)            
##     1     514 (72.8%) 242 (82.3%)           
## weight    70.3 (15.2) 68.8 (14.6)   0.143   
## height    162 (9.00)  157 (8.29)   <0.001   
## smoking:                           <0.001   
##     0     620 (87.8%) 232 (78.9%)           
##     1     86 (12.2%)  62 (21.1%)            
## activity:                           0.152   
##     0     332 (47.0%) 123 (41.8%)           
##     1     374 (53.0%) 171 (58.2%)           
## heart:                              0.301   
##     0     621 (88.0%) 266 (90.5%)           
##     1     85 (12.0%)  28 (9.52%)            
## hypert:                             0.059   
##     0     358 (50.7%) 169 (57.5%)           
##     1     348 (49.3%) 125 (42.5%)           
## diabetes:                           0.004   
##     0     602 (85.3%) 271 (92.2%)           
##     1     104 (14.7%) 23 (7.82%)            
## cancer:                             0.288   
##     0     586 (83.0%) 235 (79.9%)           
##     1     120 (17.0%) 59 (20.1%)            
## lung:                               0.852   
##     0     610 (86.4%) 252 (85.7%)           
##     1     96 (13.6%)  42 (14.3%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

Việc 3. Tính Propensity score

3.1 Xây dựng mô hình logistic

m_ps= glm(treat~ age+ sex+ weight+ height+ smoking+ activity+ heart+ hypert+ diabetes+ cancer+ lung, family= binomial(), data= ps)
summary(m_ps)

## 
## Call:
## glm(formula = treat ~ age + sex + weight + height + smoking + 
##     activity + heart + hypert + diabetes + cancer + lung, family = binomial(), 
##     data = ps)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 19.791727   2.311957   8.561  < 2e-16 ***
## age         -0.027460   0.008867  -3.097 0.001955 ** 
## sex1        -0.960857   0.256015  -3.753 0.000175 ***
## weight       0.021321   0.006436   3.313 0.000923 ***
## height      -0.123487   0.013423  -9.200  < 2e-16 ***
## smoking1     0.624629   0.208107   3.001 0.002687 ** 
## activity1    0.234277   0.153563   1.526 0.127106    
## heart1      -0.172813   0.255176  -0.677 0.498259    
## hypert1     -0.292339   0.155874  -1.875 0.060726 .  
## diabetes1   -0.745411   0.268999  -2.771 0.005587 ** 
## cancer1      0.447509   0.192421   2.326 0.020036 *  
## lung1        0.058243   0.220879   0.264 0.792020    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1211.4  on 999  degrees of freedom
## Residual deviance: 1074.4  on 988  degrees of freedom
## AIC: 1098.4
## 
## Number of Fisher Scoring iterations: 4

ps$p.score = predict(m_ps, type = "response")
head(ps)

##   id  age sex weight height smoking activity heart hypert diabetes cancer lung
## 1  1 57.0   1   99.0  165.0       0        0     0      1        0      1    1
## 2  2 57.0   1   80.9  160.0       1        1     0      1        0      1    1
## 3  3 76.5   0   75.7  175.9       0        0     0      1        0      1    0
## 4  4 72.0   1   59.0  151.0       0        1     0      0        0      0    0
## 5  5 85.5   0   55.5  157.0       0        0     0      0        0      1    0
## 6  6 48.0   1   94.0  158.5       0        0     0      1        0      0    0
##   treat death death.c    p.score
## 1     1     0       0 0.31315211
## 2     1     0       0 0.57566536
## 3     0     0       0 0.09439966
## 4     1     0       0 0.42543818
## 5     0     1       1 0.42246799
## 6     1     0       0 0.41387229

3.2 Kiểm tra phân bố của Propensity score

Giá trị trung bình

des.2 = compareGroups(treat ~ p.score + age + sex + weight + height + smoking + activity + heart + hypert + diabetes + cancer + lung, data = ps)
createTable(des.2)

## 
## --------Summary descriptives table by 'treat'---------
## 
## ___________________________________________ 
##                0           1      p.overall 
##              N=706       N=294              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## p.score   0.25 (0.15) 0.39 (0.17)  <0.001   
## age       67.7 (9.33) 65.8 (9.05)   0.002   
## sex:                                0.002   
##     0     192 (27.2%) 52 (17.7%)            
##     1     514 (72.8%) 242 (82.3%)           
## weight    70.3 (15.2) 68.8 (14.6)   0.143   
## height    162 (9.00)  157 (8.29)   <0.001   
## smoking:                           <0.001   
##     0     620 (87.8%) 232 (78.9%)           
##     1     86 (12.2%)  62 (21.1%)            
## activity:                           0.152   
##     0     332 (47.0%) 123 (41.8%)           
##     1     374 (53.0%) 171 (58.2%)           
## heart:                              0.301   
##     0     621 (88.0%) 266 (90.5%)           
##     1     85 (12.0%)  28 (9.52%)            
## hypert:                             0.059   
##     0     358 (50.7%) 169 (57.5%)           
##     1     348 (49.3%) 125 (42.5%)           
## diabetes:                           0.004   
##     0     602 (85.3%) 271 (92.2%)           
##     1     104 (14.7%) 23 (7.82%)            
## cancer:                             0.288   
##     0     586 (83.0%) 235 (79.9%)           
##     1     120 (17.0%) 59 (20.1%)            
## lung:                               0.852   
##     0     610 (86.4%) 252 (85.7%)           
##     1     96 (13.6%)  42 (14.3%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

Biểu đồ

ggplot(aes(x= p.score), data = ps) + geom_histogram(color= "white")+ facet_wrap(~ treat) + xlab("Xác suất nhận can thiệp")+ theme_bw()

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

ps$treat.c[ps$treat == 1] = "Treated"
ps$treat.c[ps$treat == 0] = "Control"

ggplot(data = ps, aes(x = p.score, fill = treat.c)) + geom_density(alpha = 0.7) + labs(x = "Propensity score", y = "", title = "Phân bố Propensity score theo nhóm can thiệp") + theme_minimal()

Việc 4. Phân tích Propensity score

4.1 Bắt cặp

match.out = matchit(treat~ age + sex + weight + height + smoking + activity + heart + hypert + diabetes + cancer + lung, data = ps, method = "nearest", ratio = 1)
summary(match.out, standardize = TRUE)

## 
## Call:
## matchit(formula = treat ~ age + sex + weight + height + smoking + 
##     activity + heart + hypert + diabetes + cancer + lung, data = ps, 
##     method = "nearest", ratio = 1)
## 
## Summary of Balance for All Data:
##           Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance         0.3885        0.2546          0.7873     1.3359    0.2242
## age             65.7755       67.7252         -0.2154     0.9413    0.0461
## sex0             0.1769        0.2720         -0.2492          .    0.0951
## sex1             0.8231        0.7280          0.2492          .    0.0951
## weight          68.7646       70.2694         -0.1031     0.9272    0.0266
## height         156.7565      161.8436         -0.6133     0.8486    0.1391
## smoking0         0.7891        0.8782         -0.2183          .    0.0891
## smoking1         0.2109        0.1218          0.2183          .    0.0891
## activity0        0.4184        0.4703         -0.1052          .    0.0519
## activity1        0.5816        0.5297          0.1052          .    0.0519
## heart0           0.9048        0.8796          0.0857          .    0.0252
## heart1           0.0952        0.1204         -0.0857          .    0.0252
## hypert0          0.5748        0.5071          0.1370          .    0.0677
## hypert1          0.4252        0.4929         -0.1370          .    0.0677
## diabetes0        0.9218        0.8527          0.2572          .    0.0691
## diabetes1        0.0782        0.1473         -0.2572          .    0.0691
## cancer0          0.7993        0.8300         -0.0767          .    0.0307
## cancer1          0.2007        0.1700          0.0767          .    0.0307
## lung0            0.8571        0.8640         -0.0197          .    0.0069
## lung1            0.1429        0.1360          0.0197          .    0.0069
##           eCDF Max
## distance    0.3245
## age         0.1214
## sex0        0.0951
## sex1        0.0951
## weight      0.0621
## height      0.2586
## smoking0    0.0891
## smoking1    0.0891
## activity0   0.0519
## activity1   0.0519
## heart0      0.0252
## heart1      0.0252
## hypert0     0.0677
## hypert1     0.0677
## diabetes0   0.0691
## diabetes1   0.0691
## cancer0     0.0307
## cancer1     0.0307
## lung0       0.0069
## lung1       0.0069
## 
## Summary of Balance for Matched Data:
##           Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance         0.3885        0.3637          0.1461     1.4474    0.0221
## age             65.7755       65.9762         -0.0222     0.8747    0.0263
## sex0             0.1769        0.1803         -0.0089          .    0.0034
## sex1             0.8231        0.8197          0.0089          .    0.0034
## weight          68.7646       68.2112          0.0379     0.8734    0.0196
## height         156.7565      157.3483         -0.0714     1.0946    0.0172
## smoking0         0.7891        0.8231         -0.0834          .    0.0340
## smoking1         0.2109        0.1769          0.0834          .    0.0340
## activity0        0.4184        0.4116          0.0138          .    0.0068
## activity1        0.5816        0.5884         -0.0138          .    0.0068
## heart0           0.9048        0.8912          0.0463          .    0.0136
## heart1           0.0952        0.1088         -0.0463          .    0.0136
## hypert0          0.5748        0.5714          0.0069          .    0.0034
## hypert1          0.4252        0.4286         -0.0069          .    0.0034
## diabetes0        0.9218        0.9286         -0.0253          .    0.0068
## diabetes1        0.0782        0.0714          0.0253          .    0.0068
## cancer0          0.7993        0.7993          0.0000          .    0.0000
## cancer1          0.2007        0.2007          0.0000          .    0.0000
## lung0            0.8571        0.8639         -0.0194          .    0.0068
## lung1            0.1429        0.1361          0.0194          .    0.0068
##           eCDF Max Std. Pair Dist.
## distance    0.1293          0.1471
## age         0.0782          1.1344
## sex0        0.0034          0.7042
## sex1        0.0034          0.7042
## weight      0.0884          1.1813
## height      0.0612          0.8181
## smoking0    0.0340          0.7004
## smoking1    0.0340          0.7004
## activity0   0.0068          0.9929
## activity1   0.0068          0.9929
## heart0      0.0136          0.6025
## heart1      0.0136          0.6025
## hypert0     0.0034          1.0252
## hypert1     0.0034          1.0252
## diabetes0   0.0068          0.5067
## diabetes1   0.0068          0.5067
## cancer0     0.0000          0.2653
## cancer1     0.0000          0.2653
## lung0       0.0068          0.6610
## lung1       0.0068          0.6610
## 
## Sample Sizes:
##           Control Treated
## All           706     294
## Matched       294     294
## Unmatched     412       0
## Discarded       0       0

4.1.1 Phân bố PS theo nhóm can thiệp

plot(match.out, type = "jitter")

## To identify the units, use first mouse button; to stop, use second.

plot(match.out, type = "hist")

4.1.2 Đặc điểm 2 nhóm sau bắt cặp

ps.matched = match.data(match.out)
dim(ps.matched)

## [1] 588  20

des.3 = compareGroups(treat ~ age + sex + weight + height + smoking + activity + heart + hypert + diabetes + cancer + lung, data = ps.matched)
createTable(des.3)

## 
## --------Summary descriptives table by 'treat'---------
## 
## ___________________________________________ 
##                0           1      p.overall 
##              N=294       N=294              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## age       66.0 (9.68) 65.8 (9.05)   0.795   
## sex:                                1.000   
##     0     53 (18.0%)  52 (17.7%)            
##     1     241 (82.0%) 242 (82.3%)           
## weight    68.2 (15.6) 68.8 (14.6)   0.657   
## height    157 (7.93)  157 (8.29)    0.377   
## smoking:                            0.348   
##     0     242 (82.3%) 232 (78.9%)           
##     1     52 (17.7%)  62 (21.1%)            
## activity:                           0.933   
##     0     121 (41.2%) 123 (41.8%)           
##     1     173 (58.8%) 171 (58.2%)           
## heart:                              0.683   
##     0     262 (89.1%) 266 (90.5%)           
##     1     32 (10.9%)  28 (9.52%)            
## hypert:                             1.000   
##     0     168 (57.1%) 169 (57.5%)           
##     1     126 (42.9%) 125 (42.5%)           
## diabetes:                           0.875   
##     0     273 (92.9%) 271 (92.2%)           
##     1     21 (7.14%)  23 (7.82%)            
## cancer:                             1.000   
##     0     235 (79.9%) 235 (79.9%)           
##     1     59 (20.1%)  59 (20.1%)            
## lung:                               0.905   
##     0     254 (86.4%) 252 (85.7%)           
##     1     40 (13.6%)  42 (14.3%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

So sánh khác biệt chuẩn hóa giữa 2 nhóm sau bắt cặp (Love plot)

library(cobalt)

##  cobalt (Version 4.6.1, Build Date: 2025-08-20)

## 
## Attaching package: 'cobalt'

## The following object is masked from 'package:MatchIt':
## 
##     lalonde

love.plot(match.out,
          stats = "mean.diffs",
          threshold = 0.1,
          abs = TRUE,
          stars = "raw",  
          var.order = "unadjusted", 
          sample.names = c("Chưa bắt cặp", "Bắt cặp"))

4.1.3 Phân tích Propensity score

Cơ bản

createTable(compareGroups(treat ~ death.c, data = ps.matched))

## 
## --------Summary descriptives table by 'treat'---------
## 
## __________________________________________ 
##               0           1      p.overall 
##             N=294       N=294              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## death.c:                           1.000   
##     0    247 (84.0%) 246 (83.7%)           
##     1    47 (16.0%)  48 (16.3%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

psa.matched = glm(death ~ treat.c, family = binomial(), data = ps.matched)
summary(psa.matched)

## 
## Call:
## glm(formula = death ~ treat.c, family = binomial(), data = ps.matched)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -1.65924    0.15914 -10.426   <2e-16 ***
## treat.cTreated  0.02511    0.22411   0.112    0.911    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 520.09  on 587  degrees of freedom
## Residual deviance: 520.08  on 586  degrees of freedom
## AIC: 524.08
## 
## Number of Fisher Scoring iterations: 3

# Unmatched analysis:
createTable(compareGroups(treat ~ death.c, data = ps))

## 
## --------Summary descriptives table by 'treat'---------
## 
## __________________________________________ 
##               0           1      p.overall 
##             N=706       N=294              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## death.c:                           0.161   
##     0    562 (79.6%) 246 (83.7%)           
##     1    144 (20.4%) 48 (16.3%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

unadjusted = glm(death ~ treat.c, family = binomial(), data = ps)
summary(unadjusted)

## 
## Call:
## glm(formula = death ~ treat.c, family = binomial(), data = ps)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -1.3617     0.0934 -14.579   <2e-16 ***
## treat.cTreated  -0.2724     0.1834  -1.486    0.137    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 978.22  on 999  degrees of freedom
## Residual deviance: 975.95  on 998  degrees of freedom
## AIC: 979.95
## 
## Number of Fisher Scoring iterations: 4

Nhiều thông tin hơn (dùng gói ‘lessR’)

library(lessR)

## Warning: package 'lessR' was built under R version 4.3.3

## 
## lessR 4.3.9                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")   Read text, Excel, SPSS, SAS, or R data file
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()

## 
## Attaching package: 'lessR'

## The following objects are masked from 'package:dplyr':
## 
##     recode, rename

Logit(death ~ treat, data = ps.matched)

## 
## Response Variable:   death
## Predictor Variable 1:  treat
## 
## Number of cases (rows) of data:  588 
## Number of cases retained for analysis:  588 
## 
## 
##    BASIC ANALYSIS 
## 
## -- Estimated Model of death for the Logit of Reference Group Membership
## 
##              Estimate    Std Err  z-value  p-value   Lower 95%   Upper 95%
## (Intercept)   -1.6592     0.1591  -10.426    0.000     -1.9711     -1.3473 
##       treat    0.0251     0.2241    0.112    0.911     -0.4141      0.4643 
## 
## 
## -- Odds Ratios and Confidence Intervals
## 
##              Odds Ratio   Lower 95%   Upper 95%
## (Intercept)      0.1903      0.1393      0.2599 
##       treat      1.0254      0.6609      1.5910 
## 
## 
## -- Model Fit
## 
##     Null deviance: 520.092 on 587 degrees of freedom
## Residual deviance: 520.080 on 586 degrees of freedom
## 
## AIC: 524.0797 
## 
## Number of iterations to convergence: 3 
## 
## 
##    ANALYSIS OF RESIDUALS AND INFLUENCE 
## Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
##    [sorted by Cook's Distance]
##    [res_rows = 20 out of 588 cases (rows) of data]
## --------------------------------------------------------------------
##     treat death P(Y=1) residual rstudent dffits    cooks
## 5       0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 27      0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 28      0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 29      0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 71      0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 74      0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 89      0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 92      0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 100     0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 121     0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 128     0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 134     0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 136     0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 149     0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 155     0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 216     0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 234     0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 245     0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 249     0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 255     0     1 0.1599   0.8401     1.92 0.1193 0.008999
## 
## 
##    PREDICTION 
## 
## Probability threshold for classification : 0.5
## 
## 
## Data, Fitted Values, Standard Errors
##    [sorted by fitted value]
##    [pred_all=TRUE to see all intervals displayed]
## --------------------------------------------------------------------
##    treat death label fitted std.err
## 5      0     1     0 0.1599 0.02137
## 7      0     1     0 0.1599 0.02137
## 12     0     0     0 0.1599 0.02137
## 13     0     0     0 0.1599 0.02137
## 
## ... for the rows of data where fitted is close to 0.5 ...
## 
##     treat death label fitted std.err
## 985     0     0     0 0.1599 0.02137
## 989     0     0     0 0.1599 0.02137
## 1       1     0     0 0.1633 0.02156
## 2       1     0     0 0.1633 0.02156
## 4       1     0     0 0.1633 0.02156
## 
## ... for the last 4 rows of sorted data ...
## 
##     treat death label fitted std.err
## 992     1     0     0 0.1633 0.02156
## 993     1     0     0 0.1633 0.02156
## 995     1     1     0 0.1633 0.02156
## 999     1     1     0 0.1633 0.02156
## --------------------------------------------------------------------
## 
## 
## ----------------------------
## Specified confusion matrices
## ----------------------------
## 
## Probability threshold for predicting : 0.5
## Corresponding cutoff threshold for treat: 66.078
## 
##               Baseline         Predicted 
## ---------------------------------------------------
##              Total  %Tot        0      1  %Correct 
## ---------------------------------------------------
##         1       95  16.2       95      0     0.0 
## death   0      493  83.8      493      0     100.0 
## ---------------------------------------------------
##       Total    588                           83.8 
## 
## Accuracy: 83.84 
## Sensitivity: 0.00 
## Precision: NaN

Logit(death ~ treat, data = ps)

## 
## Response Variable:   death
## Predictor Variable 1:  treat
## 
## Number of cases (rows) of data:  1000 
## Number of cases retained for analysis:  1000 
## 
## 
##    BASIC ANALYSIS 
## 
## -- Estimated Model of death for the Logit of Reference Group Membership
## 
##              Estimate    Std Err  z-value  p-value   Lower 95%   Upper 95%
## (Intercept)   -1.3617     0.0934  -14.579    0.000     -1.5448     -1.1786 
##       treat   -0.2724     0.1834   -1.486    0.137     -0.6318      0.0869 
## 
## 
## -- Odds Ratios and Confidence Intervals
## 
##              Odds Ratio   Lower 95%   Upper 95%
## (Intercept)      0.2562      0.2134      0.3077 
##       treat      0.7615      0.5316      1.0908 
## 
## 
## -- Model Fit
## 
##     Null deviance: 978.220 on 999 degrees of freedom
## Residual deviance: 975.949 on 998 degrees of freedom
## 
## AIC: 979.9489 
## 
## Number of iterations to convergence: 4 
## 
## 
##    ANALYSIS OF RESIDUALS AND INFLUENCE 
## Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
##    [sorted by Cook's Distance]
##    [res_rows = 20 out of 1000 cases (rows) of data]
## --------------------------------------------------------------------
##     treat death P(Y=1) residual rstudent dffits    cooks
## 17      1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 53      1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 62      1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 111     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 116     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 147     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 183     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 189     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 197     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 201     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 225     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 252     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 263     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 266     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 288     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 308     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 316     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 343     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 346     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 384     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 
## 
##    PREDICTION 
## 
## Probability threshold for classification : 0.5
## 
## 
## Data, Fitted Values, Standard Errors
##    [sorted by fitted value]
##    [pred_all=TRUE to see all intervals displayed]
## --------------------------------------------------------------------
##   treat death label fitted std.err
## 1     1     0     0 0.1633 0.02156
## 2     1     0     0 0.1633 0.02156
## 4     1     0     0 0.1633 0.02156
## 6     1     0     0 0.1633 0.02156
## 
## ... for the rows of data where fitted is close to 0.5 ...
## 
##     treat death label fitted std.err
## 995     1     1     0 0.1633 0.02156
## 999     1     1     0 0.1633 0.02156
## 3       0     0     0 0.2040 0.01516
## 5       0     1     0 0.2040 0.01516
## 7       0     1     0 0.2040 0.01516
## 
## ... for the last 4 rows of sorted data ...
## 
##      treat death label fitted std.err
## 996      0     0     0  0.204 0.01516
## 997      0     0     0  0.204 0.01516
## 998      0     0     0  0.204 0.01516
## 1000     0     1     0  0.204 0.01516
## --------------------------------------------------------------------
## 
## 
## ----------------------------
## Specified confusion matrices
## ----------------------------
## 
## Probability threshold for predicting : 0.5
## Corresponding cutoff threshold for treat: -4.998
## 
##               Baseline         Predicted 
## ---------------------------------------------------
##              Total  %Tot        0      1  %Correct 
## ---------------------------------------------------
##         1      192  19.2      192      0     0.0 
## death   0      808  80.8      808      0     100.0 
## ---------------------------------------------------
##       Total   1000                           80.8 
## 
## Accuracy: 80.80 
## Sensitivity: 0.00 
## Precision: NaN

4.2 Hiệu chỉnh

4.2.1 Đặc điểm mẫu nghiên cứu

des.2 = compareGroups(treat ~ p.score + age + sex + weight + height + smoking + activity + heart + hypert + diabetes + cancer + lung, data = ps)
createTable(des.2)

## 
## --------Summary descriptives table by 'treat'---------
## 
## ___________________________________________ 
##                0           1      p.overall 
##              N=706       N=294              
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## p.score   0.25 (0.15) 0.39 (0.17)  <0.001   
## age       67.7 (9.33) 65.8 (9.05)   0.002   
## sex:                                0.002   
##     0     192 (27.2%) 52 (17.7%)            
##     1     514 (72.8%) 242 (82.3%)           
## weight    70.3 (15.2) 68.8 (14.6)   0.143   
## height    162 (9.00)  157 (8.29)   <0.001   
## smoking:                           <0.001   
##     0     620 (87.8%) 232 (78.9%)           
##     1     86 (12.2%)  62 (21.1%)            
## activity:                           0.152   
##     0     332 (47.0%) 123 (41.8%)           
##     1     374 (53.0%) 171 (58.2%)           
## heart:                              0.301   
##     0     621 (88.0%) 266 (90.5%)           
##     1     85 (12.0%)  28 (9.52%)            
## hypert:                             0.059   
##     0     358 (50.7%) 169 (57.5%)           
##     1     348 (49.3%) 125 (42.5%)           
## diabetes:                           0.004   
##     0     602 (85.3%) 271 (92.2%)           
##     1     104 (14.7%) 23 (7.82%)            
## cancer:                             0.288   
##     0     586 (83.0%) 235 (79.9%)           
##     1     120 (17.0%) 59 (20.1%)            
## lung:                               0.852   
##     0     610 (86.4%) 252 (85.7%)           
##     1     96 (13.6%)  42 (14.3%)            
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

4.2.2 Hiệu chỉnh

psa.adj = glm(death ~ treat + p.score, family = binomial(), data = ps)
summary(psa.adj)

## 
## Call:
## glm(formula = death ~ treat + p.score, family = binomial(), data = ps)
## 
## Coefficients:
##             Estimate Std. Error z value      Pr(>|z|)    
## (Intercept) -0.92027    0.15967  -5.764 0.00000000823 ***
## treat       -0.04003    0.19664  -0.204       0.83869    
## p.score     -1.81538    0.55690  -3.260       0.00111 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 978.22  on 999  degrees of freedom
## Residual deviance: 964.82  on 997  degrees of freedom
## AIC: 970.82
## 
## Number of Fisher Scoring iterations: 4

unadjusted = glm(death ~ treat, family = binomial(), data = ps)
summary(unadjusted)

## 
## Call:
## glm(formula = death ~ treat, family = binomial(), data = ps)
## 
## Coefficients:
##             Estimate Std. Error z value            Pr(>|z|)    
## (Intercept)  -1.3617     0.0934 -14.579 <0.0000000000000002 ***
## treat        -0.2724     0.1834  -1.486               0.137    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 978.22  on 999  degrees of freedom
## Residual deviance: 975.95  on 998  degrees of freedom
## AIC: 979.95
## 
## Number of Fisher Scoring iterations: 4

Gói ‘lessR’

Logit(death ~ treat + p.score, data = ps)

## 
## Response Variable:   death
## Predictor Variable 1:  treat
## Predictor Variable 2:  p.score
## 
## Number of cases (rows) of data:  1000 
## Number of cases retained for analysis:  1000 
## 
## 
##    BASIC ANALYSIS 
## 
## -- Estimated Model of death for the Logit of Reference Group Membership
## 
##              Estimate    Std Err  z-value  p-value   Lower 95%   Upper 95%
## (Intercept)   -0.9203     0.1597   -5.764    0.000     -1.2332     -0.6073 
##       treat   -0.0400     0.1966   -0.204    0.839     -0.4254      0.3454 
##     p.score   -1.8154     0.5569   -3.260    0.001     -2.9069     -0.7239 
## 
## 
## -- Odds Ratios and Confidence Intervals
## 
##              Odds Ratio   Lower 95%   Upper 95%
## (Intercept)      0.3984      0.2914      0.5448 
##       treat      0.9608      0.6535      1.4125 
##     p.score      0.1628      0.0546      0.4849 
## 
## 
## -- Model Fit
## 
##     Null deviance: 978.220 on 999 degrees of freedom
## Residual deviance: 964.820 on 997 degrees of freedom
## 
## AIC: 970.8195 
## 
## Number of iterations to convergence: 4 
## 
## 
## Collinearity
## 
##         Tolerance       VIF
## treat       0.864     1.157
## p.score     0.864     1.157
## 
##    ANALYSIS OF RESIDUALS AND INFLUENCE 
## Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
##    [sorted by Cook's Distance]
##    [res_rows = 20 out of 1000 cases (rows) of data]
## --------------------------------------------------------------------
##     treat p.score death  P(Y=1) residual rstudent dffits    cooks
## 883     1 0.83190     1 0.07795   0.9220    2.277 0.1925 0.027573
## 189     1 0.75148     1 0.08911   0.9109    2.213 0.1748 0.020753
## 27      0 0.64782     1 0.10946   0.8905    2.115 0.1682 0.016716
## 540     1 0.69201     1 0.09827   0.9017    2.166 0.1616 0.016598
## 134     0 0.64417     1 0.11010   0.8899    2.112 0.1671 0.016439
## 111     1 0.68003     1 0.10022   0.8998    2.156 0.1590 0.015850
## 886     1 0.63222     1 0.10832   0.8917    2.118 0.1487 0.013155
## 960     1 0.62008     1 0.11047   0.8895    2.108 0.1461 0.012544
## 385     1 0.60668     1 0.11288   0.8871    2.097 0.1434 0.011902
## 249     0 0.57625     1 0.12278   0.8772    2.057 0.1469 0.011810
## 587     0 0.56179     1 0.12563   0.8744    2.045 0.1425 0.010949
## 343     1 0.57700     1 0.11839   0.8816    2.073 0.1375 0.010606
## 62      1 0.03723     1 0.26349   0.7365    1.643 0.1757 0.010421
## 858     1 0.04755     1 0.25988   0.7401    1.651 0.1726 0.010141
## 922     0 0.53152     1 0.13180   0.8682    2.020 0.1333 0.009288
## 988     1 0.09513     1 0.24361   0.7564    1.688 0.1588 0.008930
## 521     1 0.09665     1 0.24310   0.7569    1.690 0.1584 0.008894
## 252     1 0.09988     1 0.24202   0.7580    1.692 0.1575 0.008817
## 995     1 0.51617     1 0.13041   0.8696    2.025 0.1268 0.008466
## 444     1 0.51368     1 0.13092   0.8691    2.023 0.1265 0.008393
## 
## 
##    PREDICTION 
## 
## Probability threshold for classification : 0.5
## 
## 
## Data, Fitted Values, Standard Errors
##    [sorted by fitted value]
##    [pred_all=TRUE to see all intervals displayed]
## --------------------------------------------------------------------
##     treat p.score death label  fitted std.err
## 86      1  0.8509     0     0 0.07551 0.02226
## 883     1  0.8319     1     0 0.07795 0.02226
## 564     1  0.8153     0     0 0.08015 0.02225
## 282     1  0.8118     0     0 0.08062 0.02225
## 
## ... for the rows of data where fitted is close to 0.5 ...
## 
##      treat  p.score death label fitted std.err
## 845      0 0.027726     1     0 0.2748 0.02938
## 716      0 0.025445     0     0 0.2756 0.02963
## 345      0 0.008018     1     0 0.2819 0.03160
## NA      NA       NA    NA    NA     NA      NA
## NA.1    NA       NA    NA    NA     NA      NA
## 
## ... for the last 4 rows of sorted data ...
## 
##     treat  p.score death label fitted std.err
## 434     0 0.031032     0     0 0.2736 0.02902
## 845     0 0.027726     1     0 0.2748 0.02938
## 716     0 0.025445     0     0 0.2756 0.02963
## 345     0 0.008018     1     0 0.2819 0.03160
## --------------------------------------------------------------------
## 
## 
## ----------------------------
## Specified confusion matrices
## ----------------------------
## 
## Probability threshold for predicting : 0.5
## 
##               Baseline         Predicted 
## ---------------------------------------------------
##              Total  %Tot        0      1  %Correct 
## ---------------------------------------------------
##         1      192  19.2      192      0     0.0 
## death   0      808  80.8      808      0     100.0 
## ---------------------------------------------------
##       Total   1000                           80.8 
## 
## Accuracy: 80.80 
## Sensitivity: 0.00 
## Precision: NaN

Logit(death ~ treat, data = ps)

## 
## Response Variable:   death
## Predictor Variable 1:  treat
## 
## Number of cases (rows) of data:  1000 
## Number of cases retained for analysis:  1000 
## 
## 
##    BASIC ANALYSIS 
## 
## -- Estimated Model of death for the Logit of Reference Group Membership
## 
##              Estimate    Std Err  z-value  p-value   Lower 95%   Upper 95%
## (Intercept)   -1.3617     0.0934  -14.579    0.000     -1.5448     -1.1786 
##       treat   -0.2724     0.1834   -1.486    0.137     -0.6318      0.0869 
## 
## 
## -- Odds Ratios and Confidence Intervals
## 
##              Odds Ratio   Lower 95%   Upper 95%
## (Intercept)      0.2562      0.2134      0.3077 
##       treat      0.7615      0.5316      1.0908 
## 
## 
## -- Model Fit
## 
##     Null deviance: 978.220 on 999 degrees of freedom
## Residual deviance: 975.949 on 998 degrees of freedom
## 
## AIC: 979.9489 
## 
## Number of iterations to convergence: 4 
## 
## 
##    ANALYSIS OF RESIDUALS AND INFLUENCE 
## Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
##    [sorted by Cook's Distance]
##    [res_rows = 20 out of 1000 cases (rows) of data]
## --------------------------------------------------------------------
##     treat death P(Y=1) residual rstudent dffits    cooks
## 17      1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 53      1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 62      1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 111     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 116     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 147     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 183     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 189     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 197     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 201     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 225     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 252     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 263     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 266     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 288     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 308     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 316     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 343     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 346     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 384     1     1 0.1633   0.8367    1.908 0.1128 0.008776
## 
## 
##    PREDICTION 
## 
## Probability threshold for classification : 0.5
## 
## 
## Data, Fitted Values, Standard Errors
##    [sorted by fitted value]
##    [pred_all=TRUE to see all intervals displayed]
## --------------------------------------------------------------------
##   treat death label fitted std.err
## 1     1     0     0 0.1633 0.02156
## 2     1     0     0 0.1633 0.02156
## 4     1     0     0 0.1633 0.02156
## 6     1     0     0 0.1633 0.02156
## 
## ... for the rows of data where fitted is close to 0.5 ...
## 
##     treat death label fitted std.err
## 995     1     1     0 0.1633 0.02156
## 999     1     1     0 0.1633 0.02156
## 3       0     0     0 0.2040 0.01516
## 5       0     1     0 0.2040 0.01516
## 7       0     1     0 0.2040 0.01516
## 
## ... for the last 4 rows of sorted data ...
## 
##      treat death label fitted std.err
## 996      0     0     0  0.204 0.01516
## 997      0     0     0  0.204 0.01516
## 998      0     0     0  0.204 0.01516
## 1000     0     1     0  0.204 0.01516
## --------------------------------------------------------------------
## 
## 
## ----------------------------
## Specified confusion matrices
## ----------------------------
## 
## Probability threshold for predicting : 0.5
## Corresponding cutoff threshold for treat: -4.998
## 
##               Baseline         Predicted 
## ---------------------------------------------------
##              Total  %Tot        0      1  %Correct 
## ---------------------------------------------------
##         1      192  19.2      192      0     0.0 
## death   0      808  80.8      808      0     100.0 
## ---------------------------------------------------
##       Total   1000                           80.8 
## 
## Accuracy: 80.80 
## Sensitivity: 0.00 
## Precision: NaN

Việc 5. ChatGPT

PROMPT (10/12/2025): Tôi dự kiến thực hiện phân tích propensity score (Propensity Score Analysis) để so sánh tỉ lệ tử vong (biến “death”) giữa 2 nhóm can thiệp (“treat”). Bạn cung cấp R codes để (i) xây dựng mô hình logistic tiên lượng xác suất thuộc nhóm can thiệp (“treat”) với các biến số gồm age, sex, weight, height, smoking, activity, heart, hypert, diabetes, cancer, lung; (ii) ước tính propensity score cho mỗi cá nhân trong nghiên cứu; (iii) bắt cặp bằng propensity score và so sánh tỉ lệ tử vong (“death”) giữa 2 nhóm bằng hồi qui logistic; và (iv) so sánh tỉ lệ tử vong bằng hồi qui logistic có hiệu chỉnh cho propensity score trong toàn mẫu nghiên cứu

✅ (i) Xây dựng mô hình logistic dự đoán xác suất thuộc nhóm can thiệp (“treat”)

data = read.csv("C:\\Thach\\VN trips\\2026_1Jan\\Data Analysis Workshop_PN Institute\\Data\\ps_data.csv")

# Mô hình logistic để tiên lượng xác suất thuộc nhóm can thiệp
ps_model <- glm(treat ~ age + sex + weight + height + smoking + activity + heart + hypert + diabetes + cancer + lung, data = data, family = binomial)

summary(ps_model)

## 
## Call:
## glm(formula = treat ~ age + sex + weight + height + smoking + 
##     activity + heart + hypert + diabetes + cancer + lung, family = binomial, 
##     data = data)
## 
## Coefficients:
##              Estimate Std. Error z value             Pr(>|z|)    
## (Intercept) 19.791727   2.311957   8.561 < 0.0000000000000002 ***
## age         -0.027460   0.008867  -3.097             0.001955 ** 
## sex         -0.960857   0.256015  -3.753             0.000175 ***
## weight       0.021321   0.006436   3.313             0.000923 ***
## height      -0.123487   0.013423  -9.200 < 0.0000000000000002 ***
## smoking      0.624629   0.208107   3.001             0.002687 ** 
## activity     0.234277   0.153563   1.526             0.127106    
## heart       -0.172813   0.255176  -0.677             0.498259    
## hypert      -0.292339   0.155874  -1.875             0.060726 .  
## diabetes    -0.745411   0.268999  -2.771             0.005587 ** 
## cancer       0.447509   0.192421   2.326             0.020036 *  
## lung         0.058243   0.220879   0.264             0.792020    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1211.4  on 999  degrees of freedom
## Residual deviance: 1074.4  on 988  degrees of freedom
## AIC: 1098.4
## 
## Number of Fisher Scoring iterations: 4

✅ (ii) Ước tính propensity score cho mỗi cá nhân

# Ước tính propensity score
data$pscore <- predict(ps_model, type = "response")

# Kiểm tra tóm tắt
summary(data$pscore)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.008018 0.154494 0.269406 0.294000 0.413508 0.850851

✅ (iii) Bắt cặp (matching) theo propensity score + phân tích tỉ lệ tử vong bằng logistic regression Ở đây sử dụng gói MatchIt.

library(MatchIt)

# Thực hiện nearest-neighbor matching 1:1 theo propensity score
m.out <- matchit(treat ~ age + sex + weight + height + smoking + activity + heart + hypert + diabetes + cancer + lung, data = data, method = "nearest", distance = data$pscore)

# Dataset sau khi matching
matched.data <- match.data(m.out)

➤ Hồi qui logistic trên dữ liệu đã matching

# Logistic regression trên mẫu đã matched
model_matched <- glm(death ~ treat, data = matched.data, family = binomial)

summary(model_matched)

## 
## Call:
## glm(formula = death ~ treat, family = binomial, data = matched.data)
## 
## Coefficients:
##             Estimate Std. Error z value            Pr(>|z|)    
## (Intercept) -1.65924    0.15914 -10.426 <0.0000000000000002 ***
## treat        0.02511    0.22411   0.112               0.911    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 520.09  on 587  degrees of freedom
## Residual deviance: 520.08  on 586  degrees of freedom
## AIC: 524.08
## 
## Number of Fisher Scoring iterations: 3

✅ (iv) Phân tích toàn mẫu, điều chỉnh bằng propensity score

Có 2 cách phổ biến:

A. Adjusting by Covariate (PS as covariate)

model_adjust_ps <- glm(death ~ treat + pscore, data = data, family = binomial)
summary(model_adjust_ps)

## 
## Call:
## glm(formula = death ~ treat + pscore, family = binomial, data = data)
## 
## Coefficients:
##             Estimate Std. Error z value      Pr(>|z|)    
## (Intercept) -0.92027    0.15967  -5.764 0.00000000823 ***
## treat       -0.04003    0.19664  -0.204       0.83869    
## pscore      -1.81538    0.55690  -3.260       0.00111 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 978.22  on 999  degrees of freedom
## Residual deviance: 964.82  on 997  degrees of freedom
## AIC: 970.82
## 
## Number of Fisher Scoring iterations: 4

B. PS stratification (tuỳ chọn)

# Chia PS thành 5 quintiles

data$ps_quintile <- cut(data$pscore, breaks = quantile(data$pscore, probs = seq(0,1,0.2)), include.lowest = TRUE)
model_strata <- glm(death ~ treat + ps_quintile, data = data, family = binomial)
summary(model_strata)

## 
## Call:
## glm(formula = death ~ treat + ps_quintile, family = binomial, 
##     data = data)
## 
## Coefficients:
##                          Estimate Std. Error z value     Pr(>|z|)    
## (Intercept)              -0.84649    0.15532  -5.450 0.0000000504 ***
## treat                    -0.00899    0.19779  -0.045     0.963748    
## ps_quintile(0.136,0.226] -0.53824    0.23524  -2.288     0.022138 *  
## ps_quintile(0.226,0.32]  -0.80896    0.25078  -3.226     0.001256 ** 
## ps_quintile(0.32,0.444]  -0.73618    0.24795  -2.969     0.002987 ** 
## ps_quintile(0.444,0.851] -1.04940    0.27712  -3.787     0.000153 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 978.22  on 999  degrees of freedom
## Residual deviance: 957.28  on 994  degrees of freedom
## AIC: 969.28
## 
## Number of Fisher Scoring iterations: 4

PROMPT: Phân tích phân tầng của propensity score (stratified PSA) không phải là hiệu chỉnh biến Propensity score ở dạng quintiles (thay vì continuous) mà cần ước tính ảnh hưởng can thiệp trong từng quintile sau đó ước tính hiệu quả tổng hợp sau phân tầng.

ChatGPT: Bạn hoàn toàn đúng — Stratified PSA ≠ đưa quintile vào mô hình như một biến điều chỉnh. Phân tích đúng phải gồm 2 bước: (i) Ước tính hiệu quả can thiệp riêng trong từng tầng (quintile) (ii) Tổng hợp hiệu quả từ các tầng theo phương pháp meta-analysis (inverse‐variance weighting). Dưới đây là R code hoàn toàn đúng theo chuẩn Stratified Propensity Score Analysis.

✅ Bước 1 — Chia propensity score thành 5 quintiles

✅ Bước 2 — Ước tính ảnh hưởng can thiệp trong TỪNG quintile

✅ Bước 3 — Ước tính hiệu quả tổng hợp sau phân tầng

Data Analysis Workshop - Phuong Nam Institute

Thach Tran

2025-12-03