ps = read.csv ("/Users/lynguyen/Documents/6 Dao tao/Tap_huan/R 2025/Tai_lieu_R/ps_data.csv")
dim(ps)
## [1] 1000 14
##Việc 2. Mô tả đặc điểm mẫu nghiên cứu
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
##Việc 3. Tính chỉ số propensity score ###3.1 Xây dựng mô hình hồi qui logistic tiên lượng xác suất thuộc nhóm can thiệp
m_ps= glm(treat~ age + sex + weight + height + smoking + activity + heart + hypert + diabetes + cancer + lung, family=
binomial(), data= ps)
summary(m_ps)
##
## Call:
## glm(formula = treat ~ age + sex + weight + height + smoking +
## activity + heart + hypert + diabetes + cancer + lung, family = binomial(),
## data = ps)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 19.791727 2.311957 8.561 < 2e-16 ***
## age -0.027460 0.008867 -3.097 0.001955 **
## sex -0.960857 0.256015 -3.753 0.000175 ***
## weight 0.021321 0.006436 3.313 0.000923 ***
## height -0.123487 0.013423 -9.200 < 2e-16 ***
## smoking 0.624629 0.208107 3.001 0.002687 **
## activity 0.234277 0.153563 1.526 0.127106
## heart -0.172813 0.255176 -0.677 0.498259
## hypert -0.292339 0.155874 -1.875 0.060726 .
## diabetes -0.745411 0.268999 -2.771 0.005587 **
## cancer 0.447509 0.192421 2.326 0.020036 *
## lung 0.058243 0.220879 0.264 0.792020
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1211.4 on 999 degrees of freedom
## Residual deviance: 1074.4 on 988 degrees of freedom
## AIC: 1098.4
##
## Number of Fisher Scoring iterations: 4
##3.2 Tính chỉ số propensity score cho từng cá nhân
ps$p.score = predict(m_ps, type = "response")
head(ps)
## id age sex weight height smoking activity heart hypert diabetes cancer lung
## 1 1 57.0 1 99.0 165.0 0 0 0 1 0 1 1
## 2 2 57.0 1 80.9 160.0 1 1 0 1 0 1 1
## 3 3 76.5 0 75.7 175.9 0 0 0 1 0 1 0
## 4 4 72.0 1 59.0 151.0 0 1 0 0 0 0 0
## 5 5 85.5 0 55.5 157.0 0 0 0 0 0 1 0
## 6 6 48.0 1 94.0 158.5 0 0 0 1 0 0 0
## treat death p.score
## 1 1 0 0.31315211
## 2 1 0 0.57566536
## 3 0 0 0.09439966
## 4 1 0 0.42543818
## 5 0 1 0.42246799
## 6 1 0 0.41387229
##3.3 Mô tả đặc điểm phân bố của propensity score theo nhóm can thiệp
library(ggplot2)
ggplot(aes(x = p.score), data = ps) + geom_histogram(color = "white") + facet_wrap(~ treat) + xlab("Xác suất nhận can thiệp") + theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
ggplot(data = ps, aes(x = p.score, fill = as.factor(treat))) + geom_density(alpha = 0.7) + labs(x = "Propensity score", y = "", title = "Phân bố Propensity score theo nhóm can thiệp") + theme_minimal()
##Việc 4. Phân tích propensity score bắt cặp ###4.1 Bắt cặp
library ("MatchIt")
match.out = matchit(treat~ age + sex + weight + height + smoking + activity + heart + hypert + diabetes + cancer + lung, data = ps, method = "nearest", ratio = 1)
summary(match.out, standardize = TRUE)
##
## Call:
## matchit(formula = treat ~ age + sex + weight + height + smoking +
## activity + heart + hypert + diabetes + cancer + lung, data = ps,
## method = "nearest", ratio = 1)
##
## Summary of Balance for All Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance 0.3885 0.2546 0.7873 1.3359 0.2242
## age 65.7755 67.7252 -0.2154 0.9413 0.0461
## sex 0.8231 0.7280 0.2492 . 0.0951
## weight 68.7646 70.2694 -0.1031 0.9272 0.0266
## height 156.7565 161.8436 -0.6133 0.8486 0.1391
## smoking 0.2109 0.1218 0.2183 . 0.0891
## activity 0.5816 0.5297 0.1052 . 0.0519
## heart 0.0952 0.1204 -0.0857 . 0.0252
## hypert 0.4252 0.4929 -0.1370 . 0.0677
## diabetes 0.0782 0.1473 -0.2572 . 0.0691
## cancer 0.2007 0.1700 0.0767 . 0.0307
## lung 0.1429 0.1360 0.0197 . 0.0069
## eCDF Max
## distance 0.3245
## age 0.1214
## sex 0.0951
## weight 0.0621
## height 0.2586
## smoking 0.0891
## activity 0.0519
## heart 0.0252
## hypert 0.0677
## diabetes 0.0691
## cancer 0.0307
## lung 0.0069
##
## Summary of Balance for Matched Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
## distance 0.3885 0.3637 0.1461 1.4474 0.0221
## age 65.7755 65.9762 -0.0222 0.8747 0.0263
## sex 0.8231 0.8197 0.0089 . 0.0034
## weight 68.7646 68.2112 0.0379 0.8734 0.0196
## height 156.7565 157.3483 -0.0714 1.0946 0.0172
## smoking 0.2109 0.1769 0.0834 . 0.0340
## activity 0.5816 0.5884 -0.0138 . 0.0068
## heart 0.0952 0.1088 -0.0463 . 0.0136
## hypert 0.4252 0.4286 -0.0069 . 0.0034
## diabetes 0.0782 0.0714 0.0253 . 0.0068
## cancer 0.2007 0.2007 0.0000 . 0.0000
## lung 0.1429 0.1361 0.0194 . 0.0068
## eCDF Max Std. Pair Dist.
## distance 0.1293 0.1471
## age 0.0782 1.1344
## sex 0.0034 0.7042
## weight 0.0884 1.1813
## height 0.0612 0.8181
## smoking 0.0340 0.7004
## activity 0.0068 0.9929
## heart 0.0136 0.6025
## hypert 0.0034 1.0252
## diabetes 0.0068 0.5067
## cancer 0.0000 0.2653
## lung 0.0068 0.6610
##
## Sample Sizes:
## Control Treated
## All 706 294
## Matched 294 294
## Unmatched 412 0
## Discarded 0 0
##4.2 So sánh khác biệt trước và sau bắt cặp ###4.3 So sánh khác biệt bằng biểu đồ
library(cobalt)
## cobalt (Version 4.6.1, Build Date: 2025-08-20)
##
## Attaching package: 'cobalt'
## The following object is masked from 'package:MatchIt':
##
## lalonde
love.plot(match.out, stats = "mean.diffs", threshold = 0.1, abs = TRUE, stars = "raw", var.order = "unadjusted", sample.names = c("Chưa bắt cặp", "Bắt cặp"))
###4.4 Phân tích propensity score bắt cặp
ps.matched = match.data(match.out)
table1(~ as.factor(death) | treat, data = ps.matched)
## Warning in table1.formula(~as.factor(death) | treat, data = ps.matched): Terms
## to the right of '|' in formula 'x' define table columns and are expected to be
## factors with meaningful labels.
| 0 (N=294) |
1 (N=294) |
Overall (N=588) |
|
|---|---|---|---|
| as.factor(death) | |||
| 0 | 247 (84.0%) | 246 (83.7%) | 493 (83.8%) |
| 1 | 47 (16.0%) | 48 (16.3%) | 95 (16.2%) |
psa.matched = glm(death ~ treat, family = binomial(), data = ps.matched)
summary(psa.matched)
##
## Call:
## glm(formula = death ~ treat, family = binomial(), data = ps.matched)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.65924 0.15914 -10.426 <2e-16 ***
## treat 0.02511 0.22411 0.112 0.911
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 520.09 on 587 degrees of freedom
## Residual deviance: 520.08 on 586 degrees of freedom
## AIC: 524.08
##
## Number of Fisher Scoring iterations: 3
library(epiDisplay)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
##
## Attaching package: 'epiDisplay'
## The following object is masked from 'package:ggplot2':
##
## alpha
logistic.display(psa.matched)
##
## Logistic regression predicting death
##
## OR(95%CI) P(Wald's test) P(LR-test)
## treat: 1 vs 0 1.03 (0.66,1.59) 0.911 0.911
##
## Log-likelihood = -260.0398
## No. of observations = 588
## AIC value = 524.0797
##Việc 5. Phân tích hiệu chỉnh cho propensity score ###5.1 Phân tích hiệu chỉnh
psa.adj = glm(death ~ treat + p.score, family = binomial(), data = ps)
summary(psa.adj)
##
## Call:
## glm(formula = death ~ treat + p.score, family = binomial(), data = ps)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.92027 0.15967 -5.764 8.23e-09 ***
## treat -0.04003 0.19664 -0.204 0.83869
## p.score -1.81538 0.55690 -3.260 0.00111 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 978.22 on 999 degrees of freedom
## Residual deviance: 964.82 on 997 degrees of freedom
## AIC: 970.82
##
## Number of Fisher Scoring iterations: 4
logistic.display(psa.matched)
##
## Logistic regression predicting death
##
## OR(95%CI) P(Wald's test) P(LR-test)
## treat: 1 vs 0 1.03 (0.66,1.59) 0.911 0.911
##
## Log-likelihood = -260.0398
## No. of observations = 588
## AIC value = 524.0797
###5.2 Nhận xét kết quả ##Việc 6. ChatGPT #### PROMPT: Tôi dự kiến thực hiện phân tích propensity score (Propensity Score Analysis) để so sánh tỉ lệ tử vong (biến “death”) giữa 2 nhóm can thiệp (“treat”). Bạn cung cấp R codes để (i) xây dựng mô hình logistic tiên lượng xác suất thuộc nhóm can thiệp (“treat”) với các biến số gồm age, sex, weight, height, smoking, activity, heart, hypert, diabetes, cancer, lung; (ii) ước tính propensity score cho mỗi cá nhân trong nghiên cứu; (iii) bắt cặp bằng propensity score và so sánh tỉ lệ tử vong (“death”) giữa 2 nhóm bằng hồi qui logistic; và (iv) so sánh tỉ lệ tử vong bằng hồi qui logistic có hiệu chỉnh cho propensity score trong toàn mẫu nghiên cứu