Using matchthem function for imputed data.
It also can apply to 1 imputed dataset if letting m=1.
set.seed(123)
# load dataset
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(MatchThem)
##
## Attaching package: 'MatchThem'
## The following objects are masked from 'package:mice':
##
## cbind, pool
## The following object is masked from 'package:base':
##
## cbind
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks mice::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
load("C:\\Users\\hed2\\Downloads\\mybook2\\mybook2\\osteoarthritis.RData")
# check NA,
# a quick view data summary
summary(osteoarthritis)
## AGE SEX BMI RAC SMK OSP
## Min. :60.0 1: 994 Min. :17.20 0 : 42 0 :1261 0:2106
## 1st Qu.:64.0 2:1591 1st Qu.:25.10 1 :2135 1 :1296 1: 479
## Median :68.0 Median :28.10 2 : 383 NA's: 28
## Mean :68.4 Mean :28.38 3 : 23
## 3rd Qu.:73.0 3rd Qu.:31.20 NA's: 2
## Max. :79.0 Max. :48.70
## NA's :1
## KOA
## 0 :1197
## 1 :1195
## NA's: 193
##
##
##
##
md.pattern(osteoarthritis)
## AGE SEX OSP BMI RAC SMK KOA
## 2363 1 1 1 1 1 1 1 0
## 191 1 1 1 1 1 1 0 1
## 27 1 1 1 1 1 0 1 1
## 1 1 1 1 1 1 0 0 2
## 1 1 1 1 1 0 1 1 1
## 1 1 1 1 1 0 1 0 2
## 1 1 1 1 0 1 1 1 1
## 0 0 0 1 2 28 193 224
imputed.datasets <- mice(osteoarthritis, m = 1) #you can impute a no missing variable
##
## iter imp variable
## 1 1 BMI RAC SMK KOA
## 2 1 BMI RAC SMK KOA
## 3 1 BMI RAC SMK KOA
## 4 1 BMI RAC SMK KOA
## 5 1 BMI RAC SMK KOA
# Get completed data sets
CDF = complete(imputed.datasets, 'long')
md.pattern(CDF)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## .imp .id AGE SEX BMI RAC SMK OSP KOA
## 2585 1 1 1 1 1 1 1 1 1 0
## 0 0 0 0 0 0 0 0 0 0
imputed.datasets <- mice(CDF, m = 2)
##
## iter imp variable
## 1 1
## 1 2
## 2 1
## 2 2
## 3 1
## 3 2
## 4 1
## 4 2
## 5 1
## 5 2
## Warning: Number of logged events: 1
# Matching the imputed datasets on the propensity score
matched.datasets <- matchthem(OSP ~ AGE + SEX + BMI + RAC + SMK,
datasets = imputed.datasets,
approach = 'within',
method = 'nearest', #k nearest method
caliper = 0.05,
ratio = 1)
##
## Matching Observations | dataset: #1 #2
library(cobalt)
## cobalt (Version 4.5.0, Build Date: 2023-03-21)
bal.tab(matched.datasets, stats = c('m', 'ks'),
imp.fun = 'max')
## Balance summary across all imputations
## Type Max.Diff.Adj Max.KS.Adj
## distance Distance 0.0071 0.0300
## AGE Contin. 0.0309 0.0471
## SEX_2 Binary 0.0043 0.0043
## BMI Contin. 0.0046 0.0407
## RAC_0 Binary 0.0021 0.0021
## RAC_1 Binary 0.0193 0.0193
## RAC_2 Binary 0.0214 0.0214
## RAC_3 Binary 0.0043 0.0043
## SMK Binary 0.0236 0.0236
##
## Average sample sizes across imputations
## 0 1
## All 2106 479
## Matched 467 467
## Unmatched 1639 12
Differneces are close to zero.
library(survey)
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: survival
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
matched.models <- with(matched.datasets,
svyglm(KOA ~ OSP, family = quasibinomial()),
cluster = TRUE)
no svydesign() or weights arguments need to be specified as these are automatically supplied by with().
# we must pool the estimated models using pool(), containing the models we fit above
matched.results <- pool(matched.models)
summary(matched.results, conf.int = TRUE)
## term estimate std.error statistic df p.value 2.5 %
## 1 (Intercept) -0.2279628 0.09325069 -2.4446235 462.9644 0.01487225 -0.4112099
## 2 OSP1 -0.0260712 0.13222252 -0.1971767 462.9644 0.84377580 -0.2859018
## 97.5 %
## 1 -0.04471578
## 2 0.23375944
Imputing the missing data, produces the 5 imputed datasets.
Weighting the imputed datasets.
weighted.datasets <- weightthem(OSP ~ AGE + SEX + BMI + RAC + SMK,
datasets = imputed.datasets,
approach = 'across',
method = 'ps', #propensity score
estimand = 'ATM')
## Estimating distances | dataset: #1 #2
## Estimating weights | dataset: #1 #2
# and the output of the calls applied to each imputed dataset.
bal.tab(weighted.datasets, stats = c('m', 'ks'),
imp.fun = 'max')
## Balance summary across all imputations
## Type Max.Diff.Adj Max.KS.Adj
## prop.score Distance 0.0029 0.0426
## AGE Contin. 0.0205 0.0376
## SEX_2 Binary 0.0002 0.0002
## BMI Contin. 0.0065 0.0465
## RAC_0 Binary 0.0002 0.0002
## RAC_1 Binary 0.0006 0.0006
## RAC_2 Binary 0.0014 0.0014
## RAC_3 Binary 0.0006 0.0006
## SMK Binary 0.0110 0.0110
##
## Average effective sample sizes across imputations
## 0 1
## Unadjusted 2106. 479.
## Adjusted 954.53 478.31
library(survey)
weighted.models <- with(weighted.datasets,
svyglm(KOA ~ OSP, family = quasibinomial()))
#doesn't work for only 1 imputation
weighted.results <- pool(weighted.models)
summary(weighted.results, conf.int = TRUE)
## term estimate std.error statistic df p.value 2.5 %
## 1 (Intercept) -0.18997635 0.06455624 -2.9428037 2580.678 0.003281544 -0.3165636
## 2 OSP1 -0.09194026 0.11269584 -0.8158265 2580.678 0.414674650 -0.3129237
## 97.5 %
## 1 -0.06338907
## 2 0.12904318
Table 1 in RHC data
# load the dataset
ObsData <- read.csv("C:\\Users\\hed2\\Downloads\\mybook2\\mybook2\\obsdata.csv", header = TRUE)
# add column for outcome Y: length of stay
# Y = date of discharge - study admission date
# Y = date of death - study admission date if date of discharge not available
ObsData$Y <- ObsData$dschdte - ObsData$sadmdte
ObsData$Y[is.na(ObsData$Y)] <- ObsData$dthdte[is.na(ObsData$Y)] -
ObsData$sadmdte[is.na(ObsData$Y)]
# remove outcomes we are not examining in this example
ObsData <- dplyr::select(ObsData,
!c(dthdte, lstctdte, dschdte, death, t3d30, dth30, surv2md1))
# remove unnecessary and problematic variables
ObsData <- dplyr::select(ObsData,
!c(sadmdte, ptid, X, adld3p, urin1, cat2))
# convert all categorical variables to factors
factors <- c("cat1", "ca", "cardiohx", "chfhx", "dementhx", "psychhx",
"chrpulhx", "renalhx", "liverhx", "gibledhx", "malighx",
"immunhx", "transhx", "amihx", "sex", "dnr1", "ninsclas",
"resp", "card", "neuro", "gastr", "renal", "meta", "hema",
"seps", "trauma", "ortho", "race", "income")
ObsData[factors] <- lapply(ObsData[factors], as.factor)
# convert our treatment A (RHC vs. No RHC) to a binary variable
ObsData$A <- ifelse(ObsData$swang1 == "RHC", 1, 0)
ObsData <- dplyr::select(ObsData, !swang1)
# Categorize the variables to match with the original paper
ObsData$age <- cut(ObsData$age,breaks=c(-Inf, 50, 60, 70, 80, Inf),right=FALSE)
ObsData$race <- factor(ObsData$race, levels=c("white","black","other"))
ObsData$sex <- as.factor(ObsData$sex)
ObsData$sex <- relevel(ObsData$sex, ref = "Male")
ObsData$cat1 <- as.factor(ObsData$cat1)
levels(ObsData$cat1) <- c("ARF","CHF","Other","Other","Other",
"Other","Other","MOSF","MOSF")
ObsData$ca <- as.factor(ObsData$ca)
levels(ObsData$ca) <- c("Metastatic","None","Localized (Yes)")
ObsData$ca <- factor(ObsData$ca, levels=c("None",
"Localized (Yes)","Metastatic"))
# Rename variables
names(ObsData) <- c("Disease.category", "Cancer", "Cardiovascular",
"Congestive.HF", "Dementia", "Psychiatric", "Pulmonary",
"Renal", "Hepatic", "GI.Bleed", "Tumor",
"Immunosupperssion", "Transfer.hx", "MI", "age", "sex",
"edu", "DASIndex", "APACHE.score", "Glasgow.Coma.Score",
"blood.pressure", "WBC", "Heart.rate", "Respiratory.rate",
"Temperature", "PaO2vs.FIO2", "Albumin", "Hematocrit",
"Bilirubin", "Creatinine", "Sodium", "Potassium", "PaCo2",
"PH", "Weight", "DNR.status", "Medical.insurance",
"Respiratory.Diag", "Cardiovascular.Diag",
"Neurological.Diag", "Gastrointestinal.Diag", "Renal.Diag",
"Metabolic.Diag", "Hematologic.Diag", "Sepsis.Diag",
"Trauma.Diag", "Orthopedic.Diag", "race", "income",
"Y", "A")
baselinevars <- names(dplyr::select(ObsData,
!c(A,Y)))
require(tableone)
## Loading required package: tableone
tab0 <- CreateTableOne(vars = c("age", "sex", "race", "Disease.category", "Cancer"),
data = ObsData,
strata = "A",
test = T)
print(tab0, showAllLevels = FALSE, )
## Stratified by A
## 0 1 p test
## n 3551 2184
## age (%) <0.001
## [-Inf,50) 884 (24.9) 540 (24.7)
## [50,60) 546 (15.4) 371 (17.0)
## [60,70) 812 (22.9) 577 (26.4)
## [70,80) 809 (22.8) 529 (24.2)
## [80, Inf) 500 (14.1) 167 ( 7.6)
## sex = Female (%) 1637 (46.1) 906 (41.5) 0.001
## race (%) 0.425
## white 2753 (77.5) 1707 (78.2)
## black 585 (16.5) 335 (15.3)
## other 213 ( 6.0) 142 ( 6.5)
## Disease.category (%) <0.001
## ARF 1581 (44.5) 909 (41.6)
## CHF 247 ( 7.0) 209 ( 9.6)
## Other 955 (26.9) 208 ( 9.5)
## MOSF 768 (21.6) 858 (39.3)
## Cancer (%) 0.001
## None 2652 (74.7) 1727 (79.1)
## Localized (Yes) 638 (18.0) 334 (15.3)
## Metastatic 261 ( 7.4) 123 ( 5.6)
Matching caliper setup
set.seed(111)
require(MatchIt)
## Loading required package: MatchIt
##
## Attaching package: 'MatchIt'
## The following object is masked from 'package:cobalt':
##
## lalonde
ps.formula <- as.formula(paste("A~",
paste(baselinevars, collapse = "+")))
PS.fit <- glm(ps.formula,family="binomial",
data=ObsData)
ObsData$PS <- predict(PS.fit,
newdata = ObsData, type="response")
# propensity score
logitPS <- -log(1/ObsData$PS - 1)
match.obj <- matchit(ps.formula, data =ObsData,
distance = ObsData$PS,
method = "nearest", replace=FALSE,
ratio = 1,
caliper = .2*sd(logitPS)) #matching caliper
PSM diagnostics
require(cobalt)
bal.plot(match.obj,
var.name = "distance",
which = "both",
type = "histogram",
mirror = TRUE)
Plot
bal.tab(match.obj, un = TRUE,
thresholds = c(m = .1))
## Balance Measures
## Type Diff.Un Diff.Adj M.Threshold
## distance Distance 1.1558 0.1820
## Disease.category_ARF Binary -0.0290 -0.0178 Balanced, <0.1
## Disease.category_CHF Binary 0.0261 -0.0006 Balanced, <0.1
## Disease.category_Other Binary -0.1737 -0.0092 Balanced, <0.1
## Disease.category_MOSF Binary 0.1766 0.0276 Balanced, <0.1
## Cancer_None Binary 0.0439 0.0075 Balanced, <0.1
## Cancer_Localized (Yes) Binary -0.0267 -0.0109 Balanced, <0.1
## Cancer_Metastatic Binary -0.0172 0.0035 Balanced, <0.1
## Cardiovascular Binary 0.0445 -0.0104 Balanced, <0.1
## Congestive.HF Binary 0.0268 0.0012 Balanced, <0.1
## Dementia Binary -0.0472 -0.0023 Balanced, <0.1
## Psychiatric Binary -0.0348 -0.0081 Balanced, <0.1
## Pulmonary Binary -0.0737 -0.0138 Balanced, <0.1
## Renal Binary 0.0066 -0.0058 Balanced, <0.1
## Hepatic Binary -0.0124 -0.0023 Balanced, <0.1
## GI.Bleed Binary -0.0122 -0.0006 Balanced, <0.1
## Tumor Binary -0.0423 -0.0052 Balanced, <0.1
## Immunosupperssion Binary 0.0358 -0.0046 Balanced, <0.1
## Transfer.hx Binary 0.0554 0.0115 Balanced, <0.1
## MI Binary 0.0139 -0.0012 Balanced, <0.1
## age_[-Inf,50) Binary -0.0017 0.0063 Balanced, <0.1
## age_[50,60) Binary 0.0161 0.0104 Balanced, <0.1
## age_[60,70) Binary 0.0355 0.0006 Balanced, <0.1
## age_[70,80) Binary 0.0144 -0.0132 Balanced, <0.1
## age_[80, Inf) Binary -0.0643 -0.0040 Balanced, <0.1
## sex_Female Binary -0.0462 -0.0092 Balanced, <0.1
## edu Contin. 0.0910 0.0293 Balanced, <0.1
## DASIndex Contin. 0.0654 0.0263 Balanced, <0.1
## APACHE.score Contin. 0.4837 0.0813 Balanced, <0.1
## Glasgow.Coma.Score Contin. -0.1160 -0.0147 Balanced, <0.1
## blood.pressure Contin. -0.4869 -0.0680 Balanced, <0.1
## WBC Contin. 0.0799 -0.0096 Balanced, <0.1
## Heart.rate Contin. 0.1460 -0.0005 Balanced, <0.1
## Respiratory.rate Contin. -0.1641 -0.0361 Balanced, <0.1
## Temperature Contin. -0.0209 -0.0219 Balanced, <0.1
## PaO2vs.FIO2 Contin. -0.4566 -0.0560 Balanced, <0.1
## Albumin Contin. -0.2010 -0.0281 Balanced, <0.1
## Hematocrit Contin. -0.2954 -0.0293 Balanced, <0.1
## Bilirubin Contin. 0.1329 0.0319 Balanced, <0.1
## Creatinine Contin. 0.2678 0.0339 Balanced, <0.1
## Sodium Contin. -0.0927 -0.0218 Balanced, <0.1
## Potassium Contin. -0.0274 0.0064 Balanced, <0.1
## PaCo2 Contin. -0.2880 -0.0456 Balanced, <0.1
## PH Contin. -0.1163 -0.0228 Balanced, <0.1
## Weight Contin. 0.2640 0.0241 Balanced, <0.1
## DNR.status_Yes Binary -0.0696 0.0006 Balanced, <0.1
## Medical.insurance_Medicaid Binary -0.0395 -0.0035 Balanced, <0.1
## Medical.insurance_Medicare Binary -0.0327 -0.0075 Balanced, <0.1
## Medical.insurance_Medicare & Medicaid Binary -0.0144 -0.0058 Balanced, <0.1
## Medical.insurance_No insurance Binary 0.0099 0.0046 Balanced, <0.1
## Medical.insurance_Private Binary 0.0624 0.0259 Balanced, <0.1
## Medical.insurance_Private & Medicare Binary 0.0143 -0.0138 Balanced, <0.1
## Respiratory.Diag_Yes Binary -0.1277 -0.0299 Balanced, <0.1
## Cardiovascular.Diag_Yes Binary 0.1395 0.0236 Balanced, <0.1
## Neurological.Diag_Yes Binary -0.1079 -0.0098 Balanced, <0.1
## Gastrointestinal.Diag_Yes Binary 0.0453 0.0052 Balanced, <0.1
## Renal.Diag_Yes Binary 0.0264 0.0040 Balanced, <0.1
## Metabolic.Diag_Yes Binary -0.0059 0.0017 Balanced, <0.1
## Hematologic.Diag_Yes Binary -0.0146 -0.0035 Balanced, <0.1
## Sepsis.Diag_Yes Binary 0.0912 0.0138 Balanced, <0.1
## Trauma.Diag_Yes Binary 0.0105 0.0017 Balanced, <0.1
## Orthopedic.Diag_Yes Binary 0.0010 0.0012 Balanced, <0.1
## race_white Binary 0.0063 0.0069 Balanced, <0.1
## race_black Binary -0.0114 -0.0081 Balanced, <0.1
## race_other Binary 0.0050 0.0012 Balanced, <0.1
## income_$11-$25k Binary 0.0062 -0.0104 Balanced, <0.1
## income_$25-$50k Binary 0.0391 0.0173 Balanced, <0.1
## income_> $50k Binary 0.0165 0.0086 Balanced, <0.1
## income_Under $11k Binary -0.0618 -0.0155 Balanced, <0.1
##
## Balance tally for mean differences
## count
## Balanced, <0.1 68
## Not Balanced, >0.1 0
##
## Variable with the greatest mean difference
## Variable Diff.Adj M.Threshold
## APACHE.score 0.0813 Balanced, <0.1
##
## Sample sizes
## Control Treated
## All 3551 2184
## Matched 1739 1739
## Unmatched 1812 445
love.plot(match.obj, binary = "std",
thresholds = c(m = .1))
Treatment effect calculation
library(Publish)
## Loading required package: prodlim
matched.data <- match.data(match.obj)
fit.matched <- glm(Y~A,
family=gaussian,
data = matched.data)
publish(fit.matched)
## Variable Units Coefficient CI.95 p-value
## (Intercept) 21.22 [19.94;22.49] < 1e-04
## A 3.25 [1.45;5.05] 0.0004145
library("readxl")
nhefs <- read.csv("C:\\Users\\hed2\\Downloads\\mybook2\\mybook2\\nhefs.csv")
fit3 <- glm(qsmk ~ sex + race + age + I(age*age) + as.factor(education)
+ smokeintensity + I(smokeintensity*smokeintensity) + smokeyrs
+ I(smokeyrs*smokeyrs) + as.factor(exercise) + as.factor(active)
+ wt71 + I(wt71*wt71), data=nhefs, family=binomial())
summary(fit3)
##
## Call:
## glm(formula = qsmk ~ sex + race + age + I(age * age) + as.factor(education) +
## smokeintensity + I(smokeintensity * smokeintensity) + smokeyrs +
## I(smokeyrs * smokeyrs) + as.factor(exercise) + as.factor(active) +
## wt71 + I(wt71 * wt71), family = binomial(), data = nhefs)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4646 -0.8044 -0.6460 1.0578 2.3550
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.9889022 1.2412792 -1.602 0.109089
## sex -0.5075218 0.1482316 -3.424 0.000617 ***
## race -0.8502312 0.2058720 -4.130 3.63e-05 ***
## age 0.1030132 0.0488996 2.107 0.035150 *
## I(age * age) -0.0006052 0.0005074 -1.193 0.232973
## as.factor(education)2 -0.0983203 0.1906553 -0.516 0.606066
## as.factor(education)3 0.0156987 0.1707139 0.092 0.926730
## as.factor(education)4 -0.0425260 0.2642761 -0.161 0.872160
## as.factor(education)5 0.3796632 0.2203947 1.723 0.084952 .
## smokeintensity -0.0651561 0.0147589 -4.415 1.01e-05 ***
## I(smokeintensity * smokeintensity) 0.0008461 0.0002758 3.067 0.002160 **
## smokeyrs -0.0733708 0.0269958 -2.718 0.006571 **
## I(smokeyrs * smokeyrs) 0.0008384 0.0004435 1.891 0.058669 .
## as.factor(exercise)1 0.2914117 0.1735543 1.679 0.093136 .
## as.factor(exercise)2 0.3550517 0.1799293 1.973 0.048463 *
## as.factor(active)1 0.0108754 0.1298320 0.084 0.933243
## as.factor(active)2 0.0683123 0.2087269 0.327 0.743455
## wt71 -0.0128478 0.0222829 -0.577 0.564226
## I(wt71 * wt71) 0.0001209 0.0001352 0.895 0.370957
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1876.3 on 1628 degrees of freedom
## Residual deviance: 1766.7 on 1610 degrees of freedom
## AIC: 1804.7
##
## Number of Fisher Scoring iterations: 4
nhefs$ps <- predict(fit3, nhefs, type="response")
summary(nhefs$ps[nhefs$qsmk==0])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.05298 0.16949 0.22747 0.24504 0.30441 0.65788
summary(nhefs$ps[nhefs$qsmk==1])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.06248 0.22046 0.28897 0.31240 0.38122 0.79320
# alternative plot with histograms
nhefs <- nhefs %>% mutate(qsmklabel = ifelse(qsmk == 1,
yes = 'Quit Smoking 1971-1982',
no = 'Did Not Quit Smoking 1971-1982'))
ggplot(nhefs, aes(x = ps, fill = as.factor(qsmk), color = as.factor(qsmk))) +
geom_histogram(alpha = 0.3, position = 'identity', bins=15) +
facet_grid(as.factor(qsmk) ~ .) +
xlab('Probability of Quitting Smoking During Follow-up') +
ggtitle('Propensity Score Distribution by Treatment Group') +
scale_fill_discrete('') +
scale_color_discrete('') +
theme(legend.position = 'bottom', legend.direction = 'vertical')
# calculation of deciles
nhefs$ps.dec <- cut(nhefs$ps,
breaks=c(quantile(nhefs$ps, probs=seq(0,1,0.1))),
labels=seq(1:10),
include.lowest=TRUE)
# regression on PS deciles, not allowing for effect modification
fit.psdec <- glm(wt82_71 ~ qsmk + as.factor(ps.dec), data = nhefs)
summary(fit.psdec)
##
## Call:
## glm(formula = wt82_71 ~ qsmk + as.factor(ps.dec), data = nhefs)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -43.543 -3.932 -0.085 4.233 46.773
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.7505 0.6089 6.159 9.29e-10 ***
## qsmk 3.5005 0.4571 7.659 3.28e-14 ***
## as.factor(ps.dec)2 -0.7391 0.8611 -0.858 0.3908
## as.factor(ps.dec)3 -0.6182 0.8612 -0.718 0.4730
## as.factor(ps.dec)4 -0.5204 0.8584 -0.606 0.5444
## as.factor(ps.dec)5 -1.4884 0.8590 -1.733 0.0834 .
## as.factor(ps.dec)6 -1.6227 0.8675 -1.871 0.0616 .
## as.factor(ps.dec)7 -1.9853 0.8681 -2.287 0.0223 *
## as.factor(ps.dec)8 -3.4447 0.8749 -3.937 8.61e-05 ***
## as.factor(ps.dec)9 -5.1544 0.8848 -5.825 6.91e-09 ***
## as.factor(ps.dec)10 -4.8403 0.8828 -5.483 4.87e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 58.42297)
##
## Null deviance: 97176 on 1565 degrees of freedom
## Residual deviance: 90848 on 1555 degrees of freedom
## (63 observations deleted due to missingness)
## AIC: 10827
##
## Number of Fisher Scoring iterations: 2
confint.lm(fit.psdec)
## 2.5 % 97.5 %
## (Intercept) 2.556098 4.94486263
## qsmk 2.603953 4.39700504
## as.factor(ps.dec)2 -2.428074 0.94982494
## as.factor(ps.dec)3 -2.307454 1.07103569
## as.factor(ps.dec)4 -2.204103 1.16333143
## as.factor(ps.dec)5 -3.173337 0.19657938
## as.factor(ps.dec)6 -3.324345 0.07893027
## as.factor(ps.dec)7 -3.688043 -0.28248110
## as.factor(ps.dec)8 -5.160862 -1.72860113
## as.factor(ps.dec)9 -6.889923 -3.41883853
## as.factor(ps.dec)10 -6.571789 -3.10873731
# regression on the propensity score (linear term) p.qsmk instead by ps
model6 <- glm(wt82_71 ~ qsmk + ps, data = nhefs)
summary(model6)
##
## Call:
## glm(formula = wt82_71 ~ qsmk + ps, data = nhefs)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -43.314 -4.006 -0.068 4.244 47.158
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.5945 0.4831 11.581 < 2e-16 ***
## qsmk 3.5506 0.4573 7.765 1.47e-14 ***
## ps -14.8218 1.7576 -8.433 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 58.28455)
##
## Null deviance: 97176 on 1565 degrees of freedom
## Residual deviance: 91099 on 1563 degrees of freedom
## (63 observations deleted due to missingness)
## AIC: 10815
##
## Number of Fisher Scoring iterations: 2
# standardization on the propensity score
# (step 1) create two new datasets, one with all treated and one with all untreated
treated <- nhefs
treated$qsmk <- 1
untreated <- nhefs
untreated$qsmk <- 0
# (step 2) predict values for everyone in each new dataset based on above model
treated$pred.y <- predict(model6, treated)
untreated$pred.y <- predict(model6, untreated)
# (step 3) compare mean weight loss had all been treated vs. that had all been untreated
mean1 <- mean(treated$pred.y, na.rm = TRUE)
mean0 <- mean(untreated$pred.y, na.rm = TRUE)
mean1 - mean0
## [1] 3.550596
Compare to regression
model=glm(KOA ~OSP+AGE + SEX + BMI + RAC + SMK,family = binomial,data=CDF)
summary(model)
##
## Call:
## glm(formula = KOA ~ OSP + AGE + SEX + BMI + RAC + SMK, family = binomial,
## data = CDF)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8470 -1.1072 0.5365 1.1320 1.7923
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.375170 0.721545 -7.450 9.37e-14 ***
## OSP1 -0.127249 0.113806 -1.118 0.2635
## AGE 0.037473 0.007773 4.821 1.43e-06 ***
## SEX2 0.095489 0.089653 1.065 0.2868
## BMI 0.111280 0.009979 11.151 < 2e-16 ***
## RAC1 -0.367828 0.326278 -1.127 0.2596
## RAC2 0.044105 0.341171 0.129 0.8971
## RAC3 0.531672 0.547642 0.971 0.3316
## SMK1 -0.148140 0.082398 -1.798 0.0722 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3583.5 on 2584 degrees of freedom
## Residual deviance: 3387.1 on 2576 degrees of freedom
## AIC: 3405.1
##
## Number of Fisher Scoring iterations: 4
Regression has more bias, while Matching has more variance. Matching may be more conservative than regression. Meanwhile, for Matching, there would be no need to make any functional form assumptions. The relationships between the covariates, the treatment and the outcome could be highly non-linear, as long as there is common support and the CIA holds.
At last,
are the confounders directly available as variables you can condition on?