EPI 271 Propensity Score Lab 2

Create dataset

## Load data
library(sas7bdat)
dat <- read.sas7bdat("./epi271data.sas7bdat")
names(dat) <- tolower(names(dat))

## Create new variables
dat <- within(dat, {
    ## age ≥ 70
    age70 <- as.numeric(age >= 70)

    ## submart ≥ 90
    sumbart90 <- as.numeric(sumbartel >= 90)

    ## rankpre 4,5,6 to 5
    rankpre[rankpre %in% c(4,5,6)] <- 5

    ## Categorical time
    timeintcat <- cut(timeint, breaks = c(-Inf,1,3,5,Inf), labels = 1:4,
                      right = FALSE, include.lowest = T)

    ## Quantile of resident
    residentq <- cut(residents, breaks = quantile(residents), labels = 1:4, include.lowest = T)
})

## Convert categoricals to factors
categoricals <- c("age5","age70","afib","aphasia","living","gender","rankpre","time","residentq","referral","paresis","sumbart90","timeintcat","transport","vigilanz","ward")

dat[categoricals] <- lapply(dat[categoricals],
                            function(var) {
                                var <- factor(var)
                            })

1. Identify your set of confounding variables

age5 afib aphasia cardiac gender htn hyperchol icu living rankpre residentq referral paresis prevstroke sumbart90 transport timeintcat vigilanz ward timeintcat*year age70*year

2. Calculate the association between t-PA treatment and death controlling for potential confounders using a traditional logistic regression model

## Fit a traditional logistic regression model
logit.traditional <- glm(death ~ tpa + age5 + afib + aphasia + cardiac + gender + htn + hyperchol + icu + living + rankpre + residentq + referral + paresis + prevstroke + sumbart90 + transport + timeintcat + vigilanz + ward + timeintcat:year + age70:year,
                         data = dat,
                         family = binomial(link = "logit"))
## Wald test
summary(logit.traditional)


Call:
glm(formula = death ~ tpa + age5 + afib + aphasia + cardiac + 
    gender + htn + hyperchol + icu + living + rankpre + residentq + 
    referral + paresis + prevstroke + sumbart90 + transport + 
    timeintcat + vigilanz + ward + timeintcat:year + age70:year, 
    family = binomial(link = "logit"), data = dat)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.868  -0.316  -0.195  -0.106   3.328  

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)       -3.7701     0.5617   -6.71  1.9e-11 ***
tpa                0.6493     0.2093    3.10  0.00192 ** 
age52              1.1180     0.3490    3.20  0.00136 ** 
age53              1.0534     0.3487    3.02  0.00252 ** 
age54              1.2830     0.3352    3.83  0.00013 ***
age55              1.4219     0.3288    4.33  1.5e-05 ***
age56              1.2855     0.3277    3.92  8.7e-05 ***
age57              1.5415     0.3365    4.58  4.6e-06 ***
age58              1.7201     0.3344    5.14  2.7e-07 ***
afib1              0.3240     0.1087    2.98  0.00288 ** 
aphasia1           0.0918     0.1150    0.80  0.42473    
cardiac            0.1852     0.1089    1.70  0.08895 .  
gender1           -0.5661     0.1932   -2.93  0.00339 ** 
gender2           -0.4122     0.1872   -2.20  0.02765 *  
htn               -0.0124     0.1157   -0.11  0.91444    
hyperchol         -0.4905     0.1380   -3.55  0.00038 ***
icu                0.1338     0.2637    0.51  0.61205    
living1           -0.1849     0.1774   -1.04  0.29750    
living2           -0.1434     0.1551   -0.92  0.35508    
living3            0.0309     0.2392    0.13  0.89734    
rankpre1          -0.5071     0.2492   -2.03  0.04186 *  
rankpre2          -0.6026     0.2612   -2.31  0.02105 *  
rankpre3          -0.4020     0.2785   -1.44  0.14895    
rankpre5          -0.1540     0.2727   -0.56  0.57211    
residentq2        -0.4823     0.1488   -3.24  0.00119 ** 
residentq3        -0.3720     0.1472   -2.53  0.01149 *  
residentq4        -0.5688     0.1440   -3.95  7.8e-05 ***
referral1         -0.0456     0.2301   -0.20  0.84306    
referral2          0.0542     0.2206    0.25  0.80602    
referral3          0.2877     0.2069    1.39  0.16436    
referral4          0.0979     0.2440    0.40  0.68820    
paresis1          -0.2921     0.3158   -0.93  0.35493    
paresis2           0.4372     0.1582    2.76  0.00572 ** 
paresis3           0.6542     0.3116    2.10  0.03579 *  
prevstroke        -0.2971     0.1448   -2.05  0.04019 *  
sumbart901        -0.3011     0.2091   -1.44  0.14978    
transport1        -1.1560     0.3563   -3.24  0.00118 ** 
transport2         0.5993     0.2199    2.73  0.00643 ** 
transport3         0.1434     0.2194    0.65  0.51347    
transport4        -0.2055     0.3706   -0.55  0.57910    
timeintcat2        0.4682     0.3272    1.43  0.15251    
timeintcat3       -0.2626     0.4559   -0.58  0.56463    
timeintcat4       -0.5497     0.3732   -1.47  0.14076    
vigilanz1          2.0840     0.2498    8.34  < 2e-16 ***
vigilanz2          0.8826     0.1895    4.66  3.2e-06 ***
vigilanz3         -0.4024     0.1779   -2.26  0.02370 *  
ward1              0.4670     0.2376    1.97  0.04933 *  
ward2              0.6719     0.2051    3.28  0.00105 ** 
ward3              1.2874     0.2498    5.15  2.6e-07 ***
timeintcat1:year  -0.0850     0.0768   -1.11  0.26828    
timeintcat2:year  -0.2690     0.0842   -3.19  0.00141 ** 
timeintcat3:year  -0.0733     0.1283   -0.57  0.56810    
timeintcat4:year  -0.0198     0.0975   -0.20  0.83943    
year:age701       -0.0441     0.0353   -1.25  0.21143    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 4086.7  on 9145  degrees of freedom
Residual deviance: 3057.7  on 9092  degrees of freedom
  (854 observations deleted due to missingness)
AIC: 3166

Number of Fisher Scoring iterations: 8

## To display OR, epicalc::logistic.display() is easy.
library(epicalc)
res.epicalc <- logistic.display(logit.traditional)
res.epicalc


                     OR lower95ci upper95ci  Pr(>|Z|)
tpa              1.9141    1.2700    2.8850 1.924e-03
age52            3.0587    1.5434    6.0620 1.358e-03
age53            2.8673    1.4475    5.6798 2.524e-03
age54            3.6074    1.8700    6.9591 1.297e-04
age55            4.1449    2.1761    7.8950 1.525e-05
age56            3.6166    1.9027    6.8744 8.748e-05
age57            4.6716    2.4158    9.0340 4.622e-06
age58            5.5853    2.9001   10.7567 2.687e-07
afib1            1.3827    1.1174    1.7110 2.875e-03
aphasia1         1.0961    0.8750    1.3732 4.247e-01
cardiac          1.2034    0.9722    1.4896 8.895e-02
gender1          0.5678    0.3888    0.8291 3.388e-03
gender2          0.6622    0.4588    0.9557 2.765e-02
htn              0.9876    0.7872    1.2391 9.144e-01
hyperchol        0.6123    0.4672    0.8025 3.783e-04
icu              1.1431    0.6817    1.9168 6.121e-01
living1          0.8312    0.5870    1.1769 2.975e-01
living2          0.8664    0.6393    1.1742 3.551e-01
living3          1.0313    0.6453    1.6483 8.973e-01
rankpre1         0.6022    0.3695    0.9815 4.186e-02
rankpre2         0.5474    0.3280    0.9133 2.105e-02
rankpre3         0.6690    0.3876    1.1548 1.490e-01
rankpre5         0.8572    0.5024    1.4628 5.721e-01
residentq2       0.6173    0.4612    0.8263 1.185e-03
residentq3       0.6894    0.5166    0.9199 1.149e-02
residentq4       0.5662    0.4270    0.7508 7.784e-05
referral1        0.9555    0.6087    1.4999 8.431e-01
referral2        1.0557    0.6851    1.6265 8.060e-01
referral3        1.3333    0.8889    1.9999 1.644e-01
referral4        1.1029    0.6837    1.7790 6.882e-01
paresis1         0.7467    0.4021    1.3865 3.549e-01
paresis2         1.5484    1.1355    2.1113 5.725e-03
paresis3         1.9236    1.0444    3.5431 3.579e-02
prevstroke       0.7430    0.5594    0.9868 4.019e-02
sumbart901       0.7400    0.4912    1.1148 1.498e-01
transport1       0.3147    0.1566    0.6327 1.176e-03
transport2       1.8208    1.1832    2.8019 6.430e-03
transport3       1.1542    0.7507    1.7744 5.135e-01
transport4       0.8142    0.3938    1.6833 5.791e-01
timeintcat2      1.5971    0.8410    3.0332 1.525e-01
timeintcat3      0.7691    0.3147    1.8794 5.646e-01
timeintcat4      0.5771    0.2777    1.1993 1.408e-01
vigilanz1        8.0362    4.9248   13.1131 7.339e-17
vigilanz2        2.4172    1.6673    3.5043 3.197e-06
vigilanz3        0.6687    0.4719    0.9477 2.370e-02
ward1            1.5952    1.0014    2.5412 4.933e-02
ward2            1.9580    1.3100    2.9266 1.050e-03
ward3            3.6235    2.2206    5.9128 2.562e-07
timeintcat1:year 0.9185    0.7902    1.0677 2.683e-01
timeintcat2:year 0.7641    0.6478    0.9013 1.406e-03
timeintcat3:year 0.9294    0.7227    1.1951 5.681e-01
timeintcat4:year 0.9804    0.8099    1.1869 8.394e-01
year:age701      0.9569    0.8929    1.0254 2.114e-01

## Single term likelihood ratio test
drop1(logit.traditional, test = "LRT")

Single term deletions

Model:
death ~ tpa + age5 + afib + aphasia + cardiac + gender + htn + 
    hyperchol + icu + living + rankpre + residentq + referral + 
    paresis + prevstroke + sumbart90 + transport + timeintcat + 
    vigilanz + ward + timeintcat:year + age70:year
                Df Deviance  AIC   LRT Pr(>Chi)    
<none>                 3058 3166                   
tpa              1     3067 3173   9.0  0.00272 ** 
age5             7     3095 3189  37.1  4.6e-06 ***
afib             1     3066 3172   8.8  0.00305 ** 
aphasia          1     3058 3164   0.6  0.42697    
cardiac          1     3061 3167   2.9  0.09104 .  
gender           2     3066 3170   8.3  0.01609 *  
htn              1     3058 3164   0.0  0.91449    
hyperchol        1     3071 3177  13.5  0.00024 ***
icu              1     3058 3164   0.3  0.60815    
living           3     3059 3161   1.8  0.62353    
rankpre          4     3068 3168  10.2  0.03735 *  
residentq        3     3077 3179  19.6  0.00021 ***
referral         4     3064 3164   6.2  0.18541    
paresis          3     3072 3174  14.3  0.00253 ** 
prevstroke       1     3062 3168   4.4  0.03681 *  
sumbart90        1     3060 3166   2.2  0.14103    
transport        4     3107 3207  49.6  4.5e-10 ***
vigilanz         3     3260 3362 201.8  < 2e-16 ***
ward             3     3087 3189  29.8  1.5e-06 ***
timeintcat:year  4     3069 3169  11.5  0.02162 *  
year:age70       1     3059 3165   1.6  0.20892    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

a. What is the multivariable adjusted odds ratio?

OR 1.91 [1.27, 2.88] for TPA.

res.epicalc$table["tpa",,drop = F]

       OR lower95ci upper95ci Pr(>|Z|)
tpa 1.914      1.27     2.885 0.001924

3. Did the odds ratio change compared to the crude odds ratio? By how much?

Down from 3.17.

Crude analysis

logistic.display(glm(death ~ tpa, data = dat, family = binomial(link = "logit")))


Logistic regression predicting death 

             OR(95%CI)         P(Wald's test) P(LR-test)
tpa: 1 vs 0  3.17 (2.33,4.31)  < 0.001        < 0.001   

Log-likelihood = -2234.6526
No. of observations = 10000
AIC value = 4473.3053

4. Use your “confounder set” and calculate the propensity score for t-PA treatment for the study population

logit.ps <- glm(tpa ~ age5 + afib + aphasia + cardiac + gender + htn + hyperchol + icu + living + rankpre + residentq + referral + paresis + prevstroke + sumbart90 + transport + timeintcat + vigilanz + ward + timeintcat:year + age70:year,
                data = dat,
                family = binomial(link = "logit"))

5. What is the range of the propensity score? Does the range differ for treated and untreated?

There are very few patients who received TPA, and most of the non-TPA patients have very low propensity scores.

## Extract the fitted probabilities (propensity scores)
pscores <- fitted(logit.ps)

## Put them back into the dataset using case numbers (those with missing values)
dat$pscore <- NA
dat$pscore[as.numeric(names(pscores))] <- pscores

## Summaries
summary(dat$pscore)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    0.0     0.0     0.0     0.0     0.0     0.8     854

## Summaries by group
library(doBy)
res.group.summ <- summaryBy(pscore ~ tpa, dat, FUN = summary)
round(res.group.summ, 5)

  tpa pscore.Min. pscore.1st Qu. pscore.Median pscore.Mean pscore.3rd Qu. pscore.Max. pscore.NA's
1   0     0.00000        0.00034       0.00184      0.0239         0.0107       0.740         825
2   1     0.00022        0.09270       0.28500      0.2910         0.4370       0.819          29


## Histograms
library(ggplot2)
ggplot(data = dat[,c("pscore","tpa")], mapping = aes(x = pscore, fill = factor(tpa))) +
    layer(geom = "histogram", stat = "bin", binwidth = 0.02, color = "white") +
    facet_grid(tpa ~ ., scales = "free") +
    theme_bw() +
    theme(legend.key = element_blank())

plot of chunk unnamed-chunk-7


## Density plots
ggplot(data = dat[,c("pscore","tpa")], mapping = aes(x = pscore, fill = factor(tpa))) +
    layer(geom = "density", stat = "density") +
    facet_grid(tpa ~ ., scales = "fixed") +
    theme_bw() +
    theme(legend.key = element_blank())

plot of chunk unnamed-chunk-7


## Box plots
ggplot(data = dat[,c("pscore","tpa")], mapping = aes(x = factor(tpa), y = pscore, fill = factor(tpa))) +
    layer(geom = "boxplot") + 
    theme_bw() +
    theme(legend.key = element_blank())

plot of chunk unnamed-chunk-7

6. What is the c-statistic of your propensity score model?

## Load pROC
library(pROC)

## Construct an ROC
roc <- roc(logit.ps$model$tpa ~ logit.ps$fitted.values)

## Plot
plot(roc)

plot of chunk unnamed-chunk-8


Call:
roc.formula(formula = logit.ps$model$tpa ~ logit.ps$fitted.values)

Data: logit.ps$fitted.values in 8847 controls (logit.ps$model$tpa 0) < 299 cases (logit.ps$model$tpa 1).
Area under the curve: 0.939