ps = read.csv("C:\\Thach\\VN trips\\2026_1Jan\\PN Institute\\Datasets\\ps_data.csv")
dim(ps)
## [1] 1000 14
head(ps)
## id age sex weight height smoking activity heart hypert diabetes cancer lung
## 1 1 57.0 1 99.0 165.0 0 0 0 1 0 1 1
## 2 2 57.0 1 80.9 160.0 1 1 0 1 0 1 1
## 3 3 76.5 0 75.7 175.9 0 0 0 1 0 1 0
## 4 4 72.0 1 59.0 151.0 0 1 0 0 0 0 0
## 5 5 85.5 0 55.5 157.0 0 0 0 0 0 1 0
## 6 6 48.0 1 94.0 158.5 0 0 0 1 0 0 0
## treat death
## 1 1 0
## 2 1 0
## 3 0 0
## 4 1 0
## 5 0 1
## 6 1 0
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
table1(~ as.factor(sex) + age + weight + height + as.factor(smoking) + as.factor(activity) + as.factor(heart) + as.factor(hypert) + as.factor(diabetes) + as.factor(cancer) + as.factor(lung) | treat, data = ps)
## Warning in table1.formula(~as.factor(sex) + age + weight + height +
## as.factor(smoking) + : Terms to the right of '|' in formula 'x' define table
## columns and are expected to be factors with meaningful labels.
| 0 (N=706) |
1 (N=294) |
Overall (N=1000) |
|
|---|---|---|---|
| as.factor(sex) | |||
| 0 | 192 (27.2%) | 52 (17.7%) | 244 (24.4%) |
| 1 | 514 (72.8%) | 242 (82.3%) | 756 (75.6%) |
| age | |||
| Mean (SD) | 67.7 (9.33) | 65.8 (9.05) | 67.2 (9.29) |
| Median [Min, Max] | 68.5 [50.5, 90.5] | 66.0 [45.0, 96.0] | 67.5 [45.0, 96.0] |
| weight | |||
| Mean (SD) | 70.3 (15.2) | 68.8 (14.6) | 69.8 (15.0) |
| Median [Min, Max] | 68.5 [35.5, 118] | 67.0 [37.0, 113] | 68.0 [35.5, 118] |
| height | |||
| Mean (SD) | 162 (9.00) | 157 (8.29) | 160 (9.10) |
| Median [Min, Max] | 161 [140, 188] | 156 [134, 183] | 160 [134, 188] |
| as.factor(smoking) | |||
| 0 | 620 (87.8%) | 232 (78.9%) | 852 (85.2%) |
| 1 | 86 (12.2%) | 62 (21.1%) | 148 (14.8%) |
| as.factor(activity) | |||
| 0 | 332 (47.0%) | 123 (41.8%) | 455 (45.5%) |
| 1 | 374 (53.0%) | 171 (58.2%) | 545 (54.5%) |
| as.factor(heart) | |||
| 0 | 621 (88.0%) | 266 (90.5%) | 887 (88.7%) |
| 1 | 85 (12.0%) | 28 (9.5%) | 113 (11.3%) |
| as.factor(hypert) | |||
| 0 | 358 (50.7%) | 169 (57.5%) | 527 (52.7%) |
| 1 | 348 (49.3%) | 125 (42.5%) | 473 (47.3%) |
| as.factor(diabetes) | |||
| 0 | 602 (85.3%) | 271 (92.2%) | 873 (87.3%) |
| 1 | 104 (14.7%) | 23 (7.8%) | 127 (12.7%) |
| as.factor(cancer) | |||
| 0 | 586 (83.0%) | 235 (79.9%) | 821 (82.1%) |
| 1 | 120 (17.0%) | 59 (20.1%) | 179 (17.9%) |
| as.factor(lung) | |||
| 0 | 610 (86.4%) | 252 (85.7%) | 862 (86.2%) |
| 1 | 96 (13.6%) | 42 (14.3%) | 138 (13.8%) |
m_ps= glm(treat~ age + sex + weight + height + smoking + activity + heart + hypert + diabetes + cancer + lung, family=
binomial(), data= ps)
summary(m_ps)
##
## Call:
## glm(formula = treat ~ age + sex + weight + height + smoking +
## activity + heart + hypert + diabetes + cancer + lung, family = binomial(),
## data = ps)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 19.791727 2.311957 8.561 < 2e-16 ***
## age -0.027460 0.008867 -3.097 0.001955 **
## sex -0.960857 0.256015 -3.753 0.000175 ***
## weight 0.021321 0.006436 3.313 0.000923 ***
## height -0.123487 0.013423 -9.200 < 2e-16 ***
## smoking 0.624629 0.208107 3.001 0.002687 **
## activity 0.234277 0.153563 1.526 0.127106
## heart -0.172813 0.255176 -0.677 0.498259
## hypert -0.292339 0.155874 -1.875 0.060726 .
## diabetes -0.745411 0.268999 -2.771 0.005587 **
## cancer 0.447509 0.192421 2.326 0.020036 *
## lung 0.058243 0.220879 0.264 0.792020
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1211.4 on 999 degrees of freedom
## Residual deviance: 1074.4 on 988 degrees of freedom
## AIC: 1098.4
##
## Number of Fisher Scoring iterations: 4
ps$p.score = predict(m_ps, type = "response")
head(ps)
## id age sex weight height smoking activity heart hypert diabetes cancer lung
## 1 1 57.0 1 99.0 165.0 0 0 0 1 0 1 1
## 2 2 57.0 1 80.9 160.0 1 1 0 1 0 1 1
## 3 3 76.5 0 75.7 175.9 0 0 0 1 0 1 0
## 4 4 72.0 1 59.0 151.0 0 1 0 0 0 0 0
## 5 5 85.5 0 55.5 157.0 0 0 0 0 0 1 0
## 6 6 48.0 1 94.0 158.5 0 0 0 1 0 0 0
## treat death p.score
## 1 1 0 0.31315211
## 2 1 0 0.57566536
## 3 0 0 0.09439966
## 4 1 0 0.42543818
## 5 0 1 0.42246799
## 6 1 0 0.41387229
library(ggplot2)
ggplot(aes(x = p.score), data = ps) + geom_histogram(color = "white") + facet_wrap(~ treat) + xlab("Xác suất nhận can thiệp") + theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
ggplot(data = ps, aes(x = p.score, fill = as.factor(treat))) + geom_density(alpha = 0.7) + labs(x = "Propensity score", y = "", title = "Phân bố Propensity score theo nhóm can thiệp") + theme_minimal()
library(MatchIt)
## Warning: package 'MatchIt' was built under R version 4.3.3
match.out = matchit(treat~ age + sex + weight + height + smoking + activity + heart + hypert + diabetes + cancer + lung, data = ps, method = "nearest", ratio = 1)
summary(match.out, standardize = TRUE)
##
## Call:
## matchit(formula = treat ~ age + sex + weight + height + smoking +
## activity + heart + hypert + diabetes + cancer + lung, data = ps,
## method = "nearest", ratio = 1)
##
## Summary of Balance for All Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance 0.3885 0.2546 0.7873 1.3359 0.2242
## age 65.7755 67.7252 -0.2154 0.9413 0.0461
## sex 0.8231 0.7280 0.2492 . 0.0951
## weight 68.7646 70.2694 -0.1031 0.9272 0.0266
## height 156.7565 161.8436 -0.6133 0.8486 0.1391
## smoking 0.2109 0.1218 0.2183 . 0.0891
## activity 0.5816 0.5297 0.1052 . 0.0519
## heart 0.0952 0.1204 -0.0857 . 0.0252
## hypert 0.4252 0.4929 -0.1370 . 0.0677
## diabetes 0.0782 0.1473 -0.2572 . 0.0691
## cancer 0.2007 0.1700 0.0767 . 0.0307
## lung 0.1429 0.1360 0.0197 . 0.0069
## eCDF Max
## distance 0.3245
## age 0.1214
## sex 0.0951
## weight 0.0621
## height 0.2586
## smoking 0.0891
## activity 0.0519
## heart 0.0252
## hypert 0.0677
## diabetes 0.0691
## cancer 0.0307
## lung 0.0069
##
## Summary of Balance for Matched Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance 0.3885 0.3637 0.1461 1.4474 0.0221
## age 65.7755 65.9762 -0.0222 0.8747 0.0263
## sex 0.8231 0.8197 0.0089 . 0.0034
## weight 68.7646 68.2112 0.0379 0.8734 0.0196
## height 156.7565 157.3483 -0.0714 1.0946 0.0172
## smoking 0.2109 0.1769 0.0834 . 0.0340
## activity 0.5816 0.5884 -0.0138 . 0.0068
## heart 0.0952 0.1088 -0.0463 . 0.0136
## hypert 0.4252 0.4286 -0.0069 . 0.0034
## diabetes 0.0782 0.0714 0.0253 . 0.0068
## cancer 0.2007 0.2007 0.0000 . 0.0000
## lung 0.1429 0.1361 0.0194 . 0.0068
## eCDF Max Std. Pair Dist.
## distance 0.1293 0.1471
## age 0.0782 1.1344
## sex 0.0034 0.7042
## weight 0.0884 1.1813
## height 0.0612 0.8181
## smoking 0.0340 0.7004
## activity 0.0068 0.9929
## heart 0.0136 0.6025
## hypert 0.0034 1.0252
## diabetes 0.0068 0.5067
## cancer 0.0000 0.2653
## lung 0.0068 0.6610
##
## Sample Sizes:
## Control Treated
## All 706 294
## Matched 294 294
## Unmatched 412 0
## Discarded 0 0
library(cobalt)
## cobalt (Version 4.6.1, Build Date: 2025-08-20)
##
## Attaching package: 'cobalt'
## The following object is masked from 'package:MatchIt':
##
## lalonde
love.plot(match.out, stats = "mean.diffs", threshold = 0.1, abs = TRUE, stars = "raw", var.order = "unadjusted", sample.names = c("Chưa bắt cặp", "Bắt cặp"))
ps.matched = match.data(match.out)
table1(~ as.factor(death) | treat, data = ps.matched)
## Warning in table1.formula(~as.factor(death) | treat, data = ps.matched): Terms
## to the right of '|' in formula 'x' define table columns and are expected to be
## factors with meaningful labels.
| 0 (N=294) |
1 (N=294) |
Overall (N=588) |
|
|---|---|---|---|
| as.factor(death) | |||
| 0 | 247 (84.0%) | 246 (83.7%) | 493 (83.8%) |
| 1 | 47 (16.0%) | 48 (16.3%) | 95 (16.2%) |
psa.matched = glm(death ~ treat, family = binomial(), data = ps.matched)
summary(psa.matched)
##
## Call:
## glm(formula = death ~ treat, family = binomial(), data = ps.matched)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.65924 0.15914 -10.426 <2e-16 ***
## treat 0.02511 0.22411 0.112 0.911
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 520.09 on 587 degrees of freedom
## Residual deviance: 520.08 on 586 degrees of freedom
## AIC: 524.08
##
## Number of Fisher Scoring iterations: 3
library(epiDisplay)
## Warning: package 'epiDisplay' was built under R version 4.3.2
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
##
## Attaching package: 'epiDisplay'
## The following object is masked from 'package:ggplot2':
##
## alpha
logistic.display(psa.matched)
##
## Logistic regression predicting death
##
## OR(95%CI) P(Wald's test) P(LR-test)
## treat: 1 vs 0 1.03 (0.66,1.59) 0.911 0.911
##
## Log-likelihood = -260.0398
## No. of observations = 588
## AIC value = 524.0797
psa.adj = glm(death ~ treat + p.score, family = binomial(), data = ps)
summary(psa.adj)
##
## Call:
## glm(formula = death ~ treat + p.score, family = binomial(), data = ps)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.92027 0.15967 -5.764 8.23e-09 ***
## treat -0.04003 0.19664 -0.204 0.83869
## p.score -1.81538 0.55690 -3.260 0.00111 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 978.22 on 999 degrees of freedom
## Residual deviance: 964.82 on 997 degrees of freedom
## AIC: 970.82
##
## Number of Fisher Scoring iterations: 4
logistic.display(psa.matched)
##
## Logistic regression predicting death
##
## OR(95%CI) P(Wald's test) P(LR-test)
## treat: 1 vs 0 1.03 (0.66,1.59) 0.911 0.911
##
## Log-likelihood = -260.0398
## No. of observations = 588
## AIC value = 524.0797
PROMPT: Tôi dự kiến thực hiện phân tích propensity score (Propensity Score Analysis) để so sánh tỉ lệ tử vong (biến “death”) giữa 2 nhóm can thiệp (“treat”). Bạn cung cấp R codes để (i) xây dựng mô hình logistic tiên lượng xác suất thuộc nhóm can thiệp (“treat”) với các biến số gồm age, sex, weight, height, smoking, activity, heart, hypert, diabetes, cancer, lung; (ii) ước tính propensity score cho mỗi cá nhân trong nghiên cứu; (iii) bắt cặp bằng propensity score và so sánh tỉ lệ tử vong (“death”) giữa 2 nhóm bằng hồi qui logistic; và (iv) so sánh tỉ lệ tử vong bằng hồi qui logistic có hiệu chỉnh cho propensity score trong toàn mẫu nghiên cứu