EPI 271 Propensity Score Lab 1

1. Read in the data set on “g:\shared\epi271” (epi271data.sasbdat)

## Load data
library(sas7bdat)
dat <- read.sas7bdat("./epi271data.sas7bdat")
names(dat) <- tolower(names(dat))

## Create new variables
dat <- within(dat, {
    ## age ≥ 70
    age70 <- as.numeric(age >= 70)

    ## submart ≥ 90
    sumbart90 <- as.numeric(sumbartel >= 90)

    ## rankpre 4,5,6 to 5
    rankpre[rankpre %in% c(4,5,6)] <- 5

    ## Categorical time
    timeintcat <- cut(timeint, breaks = c(-Inf,1,3,5,Inf), labels = 1:4,
                      right = FALSE, include.lowest = T)

    ## Quantile of resident
    residentq <- cut(residents, breaks = quantile(residents), labels = 1:4, include.lowest = T)
})

## Convert categoricals to factors
categoricals <- c("age5","age70","afib","aphasia","living","gender","rankpre","time","residentq","referral","paresis","sumbart90","timeintcat","transport","vigilanz","ward")

dat[categoricals] <- lapply(dat[categoricals],
                            function(var) {
                                var <- factor(var)
                            })

2. How many observations are in data set?

nrow(dat)
[1] 10000

3. Identify the main exposure and the main outcome variable

4. What is the prevalence of the exposure? 3.28%

tab.tpa <- table(dat$tpa)
tab.tpa

   0    1 
9672  328 
prop.table(tab.tpa)

     0      1 
0.9672 0.0328 

5. What is the prevalence of the outcome? 5.95%

tab.death <- table(dat$death)
tab.death

   0    1 
9405  595 
prop.table(tab.death)

     0      1 
0.9405 0.0595 

6. What is the crude association between exposure and outcome? Significant by both chi-squared test and Fisher’s exact test

xtabs.tpa.death <- xtabs(~ tpa + death, data = dat)
chisq.test(xtabs.tpa.death, correct = F)

    Pearson's Chi-squared test

data:  xtabs.tpa.death
X-squared = 59.44, df = 1, p-value = 1.261e-14
fisher.test(xtabs.tpa.death)

    Fisher's Exact Test for Count Data

data:  xtabs.tpa.death
p-value = 5.397e-11
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 2.279 4.332
sample estimates:
odds ratio 
     3.167 

7. Identify potential confounding factors A common cause of the exposure and outcomes is a potential confounder. So factors relating to both may be confounders, but can also can be a source of selection bias (common effect).

## List covariates (other than the exposure and outcome)
covariates <- names(dat)[grep("tpa|death", names(dat), invert = T)]
covariates
 [1] "vigilanz"   "dysart"     "doppler"    "duplex"     "tcd"        "gerinn"     "hypercho"   "hyperton"  
 [9] "stunit"     "rentner"    "time"       "randnum"    "id"         "gender"     "age"        "working"   
[17] "cardiac"    "comorbid"   "residents"  "ward"       "bed_res"    "beds"       "diabetes"   "referral"  
[25] "occupat"    "department" "prevstroke" "year"       "coagulo"    "hyperchol"  "htn"        "icu"       
[33] "speech"     "ems"        "age5"       "afib"       "paresis"    "rankpre"    "rankat"     "retired"   
[41] "sumbartel"  "transport"  "living"     "carotsteno" "stdward"    "aphasia"    "timeint"    "residentq" 
[49] "timeintcat" "sumbart90"  "age70"     

## Perform univariate logistic regression against outcome for each covariate
logit.outcome.var <- lapply(dat[, covariates],
                            function(Var) {
                                ## Perform single predictor glm
                                glm(death ~ Var, data = dat, family = binomial(link = "logit"))
                            })

## Extract p-values for likelihood ratio test
factors.assoc.with.outcome <- sapply(logit.outcome.var,
                                     function(X) {
                                         ## LRT for each glm object
                                         drop1(X, test = "LRT")["Var", "Pr(>Chi)"]
                                     })


## Perform univariate logistic regression against exposure for each covariate
logit.exposure.var <- lapply(dat[, covariates],
                            function(Var) {
                                ## Perform single predictor glm
                                glm(tpa ~ Var, data = dat, family = binomial)
                            })

## Extract p-values for likelihood ratio test
factors.assoc.with.exposure <- sapply(logit.exposure.var,
                                      function(X) {
                                          drop1(X, test = "LRT")["Var", "Pr(>Chi)"]
                                      })

## Mark factors significant at p < 0.2 threshold
data.frame(outcome      = factors.assoc.with.outcome,
           outcome.sig  = factor(factors.assoc.with.outcome < 0.2, c(FALSE,TRUE), c("","*")),
           exposure     = factors.assoc.with.exposure,
           exposure.sig = factor(factors.assoc.with.exposure < 0.2, c(FALSE,TRUE), c("","*"))
           )
              outcome outcome.sig   exposure exposure.sig
vigilanz   4.762e-144           *  5.227e-07            *
dysart      1.925e-03           *  2.780e-03            *
doppler     5.280e-30           *  5.713e-04            *
duplex      2.823e-23           *  1.970e-01            *
tcd         5.823e-19           *  2.072e-01             
gerinn      4.083e-01              1.005e-01            *
hypercho    2.903e-21           *  3.037e-01             
hyperton    5.063e-01              2.109e-01             
stunit      2.835e-03           *  6.508e-02            *
rentner     1.415e-05           *  7.840e-01             
time        1.727e-03           *  3.442e-01             
randnum     7.355e-01              6.329e-01             
id          5.363e-02           *  6.764e-01             
gender      2.954e-08           *  1.760e-09            *
age         4.594e-01              8.890e-01             
working     1.477e-09           *  2.012e-05            *
cardiac     4.494e-04           *  5.207e-01             
comorbid    5.253e-02           *  2.013e-01             
residents   2.458e-03           *  2.906e-04            *
ward        8.496e-33           *  4.314e-44            *
bed_res     8.452e-01              4.968e-01             
beds        8.164e-01              5.702e-02            *
diabetes    5.290e-01              9.598e-01             
referral    3.638e-28           *  8.523e-36            *
occupat     4.630e-01              3.645e-01             
department  2.320e-06           *  7.888e-03            *
prevstroke  8.387e-01              3.425e-04            *
year        5.290e-02           *  7.759e-01             
coagulo     4.083e-01              1.005e-01            *
hyperchol   2.903e-21           *  3.037e-01             
htn         5.063e-01              2.109e-01             
icu         1.710e-01           *  2.696e-05            *
speech      1.696e-03           *  7.813e-02            *
ems         1.012e-01           *  1.947e-01            *
age5        2.412e-32           *  3.012e-13            *
afib        5.128e-38           *  5.174e-09            *
paresis     1.072e-48           *  1.030e-29            *
rankpre     6.565e-20           *  4.987e-17            *
rankat      6.054e-01              9.408e-01             
retired     1.415e-05           *  7.840e-01             
sumbartel  6.184e-195           *  9.148e-56            *
transport   1.653e-81           *  2.484e-58            *
living      1.603e-10           *  6.679e-07            *
carotsteno  1.934e-01           *  1.322e-01            *
stdward     4.927e-06           *  1.293e-22            *
aphasia     8.543e-01              4.357e-02            *
timeint     3.675e-05           *  1.156e-49            *
residentq   3.314e-07           *  1.221e-03            *
timeintcat  1.674e-31           * 2.498e-144            *
sumbart90   8.666e-47           *  2.609e-22            *
age70       4.222e-01              9.191e-01