1. Required packages

#packages
library(RCurl)
library(knitr)
library(elrm)
library(psych)
library(survival)
library(mice)
library(VIM)
library(MASS)
library(BaylorEdPsych)
library(pROC)
library(GGally)
library(tableone)
library(dplyr)
library(ggplot2)

2. Load data and create derived variables

# data read in
url <- "https://raw.githubusercontent.com/davidadambarr/EPTB_PUR/master/PURTB_raw.csv"
x <- getURL(url)
df <- read.csv(text = x)

### coerce data types and clean data
#dates
format <- "%d/%m/%Y"
df$PUR.date <- as.Date(as.character(df$PUR.date),format)
df$Date.TB.Rx.start <- as.Date(as.character(df$Date.TB.Rx.start),format)
df$Date.TB.Rx.finish <- as.Date(as.character(df$Date.TB.Rx.finish),format)
df$VitD.Rx.Date <- as.Date(as.character(df$VitD.Rx.Date),format)

# below limit of detection values
df$CRP <- as.character(df$CRP)
df$CRP[df$CRP=="<0.2"] <- "0.2"
df$CRP[df$CRP=="<3"] <- "3"
df$CRP <- as.numeric(df$CRP)

### create 'timing' variables
df$PUR.time <- as.numeric(df$PUR.date - df$Date.TB.Rx.start)

### create a 'number of sites' variable
df <- dplyr::mutate(df, NumSites=pleural+internal.LN+external.LN+CNS+BJI+pericardial+abdominal+other.site)
df <- dplyr::mutate(df, more1site=NumSites>1|miliary==1)
df$more1site <- as.factor(df$more1site)

3. Summary of data by Vit D prescription status (table 1)

### Table 1
df <- dplyr::select(df, -number)

vars <- c("age", "sex", "PUR", "vitD.Rx", "ethnicity", "clinic",
          "lymphocytes","monocytes","neutrophils","haemoglobin","ESR","albumin","CRP",
          "vitD.baseline","vitD.6mths","vitD.12mths",
          "pleural","internal.LN","external.LN","CNS", "BJI","pericardial","abdominal","miliary","other.site",
          "microscopy.diagnosis","culture.diagnosis","histology.diagnosis", "basis.diagnosis", "more1site",
          "baseline.steroid","other.immuno.drug",
          "PUR.Rx","ADR","prior.hypercalcaemia","prior.hypocalcaemia","calcaemia_during.TBRx")

# numeric site variables to factors
siteVars <- c("pleural","internal.LN","external.LN","CNS", "BJI","pericardial","abdominal","miliary","other.site")
df[ ,siteVars] <- lapply(df[ ,siteVars], as.factor)

catVars <- sapply(df, is.factor)
numVars <- sapply(df, is.numeric)
catVars <- names(df)[catVars]
numVars <- names(df)[numVars]


# nonparametric and exact tests for p values

# overall cohort
tbl1.2 <- tableone::CreateTableOne(data=df, vars=vars)
tbl1.2 <- print(tbl1.2, showAllLevels = FALSE, format = "f", 
              nonnormal=numVars,
              quote = FALSE, noSpaces = TRUE, printToggle = FALSE)

# stratified by vit D prescription status
tbl1 <- tableone::CreateTableOne(data=df, vars=vars, strata = "vitD.Rx")
tbl1 <- print(tbl1, showAllLevels = FALSE, format = "f", 
              exact=catVars, nonnormal=numVars,
              quote = FALSE, noSpaces = TRUE, printToggle = FALSE)

Printing tables. First is the overall cohort description:

kable(tbl1.2, format = "markdown")
Overall
n 249
age (median [IQR]) 36.00 [28.90, 50.70]
sex = male 158
PUR = PUR 37
vitD.Rx = VitD.Rx 57
ethnicity
African 27
Middle.Eastern 2
SE.Asian 11
South.Asian 149
White.European 60
clinic
A 104
B 62
C 53
D 30
lymphocytes (median [IQR]) 1.40 [1.02, 1.90]
monocytes (median [IQR]) 0.60 [0.43, 0.80]
neutrophils (median [IQR]) 4.50 [3.40, 6.08]
haemoglobin (median [IQR]) 130.00 [115.00, 140.00]
ESR (median [IQR]) 29.00 [13.00, 51.00]
albumin (median [IQR]) 34.00 [30.00, 38.00]
CRP (median [IQR]) 23.00 [5.00, 65.00]
vitD.baseline (median [IQR]) 10.00 [7.00, 19.00]
vitD.6mths (median [IQR]) 26.00 [13.00, 44.75]
vitD.12mths (median [IQR]) 7.00 [7.00, 17.75]
pleural = 1 48
internal.LN = 1 134
external.LN = 1 96
CNS = 1 6
BJI = 1 36
pericardial = 1 9
abdominal = 1 47
miliary = 1 2
other.site = 1 22
microscopy.diagnosis = positive 80
culture.diagnosis = positive 153
histology.diagnosis = positive 138
basis.diagnosis = micro_or_histo_confirmed 222
more1site = TRUE 96
baseline.steroid = Yes 54
other.immuno.drug = Yes 11
PUR.Rx
Extended.course.TB.therapy 5
None 12
NSAID 2
Oral.Steroid 11
Percutaneous.Aspiration 2
Percutaneous.Aspiration.Oral.Steroid 3
Surgery 1
ADR = No_recorded_ADR 182
prior.hypercalcaemia = Prior_hypercalcaemia 15
prior.hypocalcaemia = Prior_hypocalcaemia 7
calcaemia_during.TBRx
hypercalcaemic 10
hypocalcaemia 4
normocalcaemic 192

Second, whole cohort description stratified by vitamin D prescription status:

kable(tbl1, format = "markdown")
Nil.vitD.Rx VitD.Rx p test
n 192 57
age (median [IQR]) 38.65 [29.48, 52.33] 32.70 [26.80, 43.40] 0.055 nonnorm
sex = male 123 35 0.755 exact
PUR = PUR 20 17 0.001 exact
vitD.Rx = VitD.Rx 0 57 <0.001 exact
ethnicity 0.039 exact
African 18 9
Middle.Eastern 1 1
SE.Asian 10 1
South.Asian 110 39
White.European 53 7
clinic <0.001 exact
A 97 7
B 38 24
C 42 11
D 15 15
lymphocytes (median [IQR]) 1.40 [1.04, 1.90] 1.34 [0.97, 1.82] 0.592 nonnorm
monocytes (median [IQR]) 0.61 [0.50, 0.80] 0.56 [0.40, 0.80] 0.177 nonnorm
neutrophils (median [IQR]) 4.55 [3.40, 5.90] 4.41 [3.38, 6.78] 0.665 nonnorm
haemoglobin (median [IQR]) 130.00 [116.00, 141.00] 125.50 [111.00, 137.00] 0.223 nonnorm
ESR (median [IQR]) 33.00 [13.00, 50.00] 26.50 [10.00, 61.25] 0.944 nonnorm
albumin (median [IQR]) 35.00 [30.00, 38.00] 33.00 [28.00, 37.00] 0.131 nonnorm
CRP (median [IQR]) 21.00 [4.60, 64.75] 29.00 [6.00, 65.00] 0.505 nonnorm
vitD.baseline (median [IQR]) 16.00 [7.00, 26.00] 7.00 [7.00, 10.00] 0.004 nonnorm
vitD.6mths (median [IQR]) 24.00 [18.75, 28.50] 26.00 [9.25, 58.50] 0.846 nonnorm
vitD.12mths (median [IQR]) 8.50 [7.00, 17.75] 7.00 [7.00, 15.75] 0.817 nonnorm
pleural = 1 39 9 0.567 exact
internal.LN = 1 103 31 1.000 exact
external.LN = 1 70 26 0.219 exact
CNS = 1 4 2 0.623 exact
BJI = 1 21 15 0.009 exact
pericardial = 1 6 3 0.432 exact
abdominal = 1 34 13 0.441 exact
miliary = 1 1 1 0.406 exact
other.site = 1 17 5 1.000 exact
microscopy.diagnosis = positive 62 18 1.000 exact
culture.diagnosis = positive 111 42 0.031 exact
histology.diagnosis = positive 103 35 0.363 exact
basis.diagnosis = micro_or_histo_confirmed 167 55 0.051 exact
more1site = TRUE 70 26 0.219 exact
baseline.steroid = Yes 42 12 1.000 exact
other.immuno.drug = Yes 10 1 0.465 exact
PUR.Rx 0.058 exact
Extended.course.TB.therapy 5 0
None 6 6
NSAID 0 2
Oral.Steroid 4 7
Percutaneous.Aspiration 2 0
Percutaneous.Aspiration.Oral.Steroid 1 2
Surgery 1 0
ADR = No_recorded_ADR 140 42 0.864 exact
prior.hypercalcaemia = Prior_hypercalcaemia 14 1 0.203 exact
prior.hypocalcaemia = Prior_hypocalcaemia 4 3 0.362 exact
calcaemia_during.TBRx 0.879 exact
hypercalcaemic 7 3
hypocalcaemia 3 1
normocalcaemic 144 48

Note that there are several potential confounders including clinic site and ethnicity as predicted a priori.

4. PUR descrition

Distribution of timing first PUR event in those patients who experienced a PUR.

# timing
summary(df$PUR.time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    10.0    27.0    52.5   100.4   167.8   504.0     213
ggplot(df, aes(x=PUR.time)) + geom_histogram() + ggtitle("Timing of first PUR, days from start TB Rx")

ggplot(df, aes(x=log(PUR.time))) + geom_histogram(bins=10)+
  ggtitle("Timing of first PUR, log(days)")

time <- df$PUR.time
event <- as.numeric(df$PUR=="PUR")
# looking at PUR positive patients only:
kmsurvival <- survfit(Surv(time,event) ~ 1)
plot(kmsurvival, xlab="Time in days from start RHZE",ylab="Proportion no PUR")
title(main = "Timing of PURs")

Treatment modalities used for PUR cases.

# treatment of PUR
table(df$PUR.Rx)
## 
##           Extended.course.TB.therapy                                 None 
##                                    5                                   12 
##                                NSAID                         Oral.Steroid 
##                                    2                                   11 
##              Percutaneous.Aspiration Percutaneous.Aspiration.Oral.Steroid 
##                                    2                                    3 
##                              Surgery 
##                                    1

5. Distribution of numerical variables

Several of the distributions look more normal after transformation. For example, lymphocyte count:

par(mfrow=c(2,2))
hist(df$lymphocytes, main = "Lymphocyte count histogram")
qqnorm(df$lymphocytes, main = "Lymphocyte count QQ plot")
hist(log(df$lymphocytes+1), main = "After transformation histogram")
qqnorm(log(df$lymphocytes+1), main = "After transformation QQ plot")

par(mfrow=c(1,1))

6. Missing data

Here, proportion of observations missing is plotted by variable:

vars <- c("PUR","ethnicity","clinic","age", "sex",
          "more1site",
          "microscopy.diagnosis", "culture.diagnosis", "histology.diagnosis",
          "baseline.steroid", "other.immuno.drug", 
          "lymphocytes", "monocytes", "neutrophils", "haemoglobin", 
          "ESR", "albumin", "CRP","vitD.baseline","vitD.6mths", "vitD.Rx")
dat <- df[ ,vars]

# check for missingness by variable
missing <- data.frame(colSums(is.na(dat)))
vars <- rownames(missing)
missing <- cbind(missing, vars)
n <- nrow(dat)
names(missing) <- c("n_missing", "vars")
missing$vars <- as.character(missing$vars)
missing <- dplyr::mutate(missing, prop_missing=n_missing/n)
p2 <- ggplot(missing, aes(x=reorder(vars, -prop_missing), y=prop_missing))
p2 + geom_bar(stat="identity") + theme(axis.text.x=element_text(angle=90, hjust=1)) +
  ggtitle("Proportion missing by variable")

Here, all the data is visualised by variable (columns) and by patient (rows). Higher values are a darker greyscale shade, absence of a binary variable is white and presence of a binary variable is black. Missing observations are coloured red.

mar.default <- par("mar")
cex.default <- par("cex")
par(mar=c(12, 4.1,4.1,2.1), cex=0.7)
matrixplot(dat)

par(mar=c(mar.default))  
par(cex=c(cex.default))

**Note Vit D serum levels and ESR are majority incomplete so can’t easily be used in later regression modeling. Other baseline blood results are about 10-12% missing and they appear to be missing in non-random way (e.g. some clinic sites more complete than others. This means deletion of cases with missing observations may introduce bias in regression modelling and suggests imputation of missing observations is more appropriate fix.

7. Univariate associations with PUR

vars <- c("age", "sex", "vitD.Rx", "ethnicity", "clinic",
          "lymphocytes","monocytes","neutrophils","haemoglobin","ESR","albumin","CRP",
          "vitD.baseline", "more1site",
          "microscopy.diagnosis","culture.diagnosis","histology.diagnosis", "basis.diagnosis",
          "baseline.steroid","other.immuno.drug","PUR")

nonnorm <- c("lymphocytes", "monocytes", "neutrophils", "haemoglobin", "ESR", 
             "albumin", "CRP", "age", "vitD.baseline")
catVars <- sapply(df, is.factor)
numVars <- sapply(df, is.numeric)
catVars <- names(df)[catVars]
numVars <- names(df)[numVars]

catVars <- dplyr::intersect(catVars, vars)

tbl2 <- tableone::CreateTableOne(data=df, vars=vars, strata = "PUR")

tbl2 <- print(tbl2, showAllLevels = FALSE, format = "f", 
              exact=catVars, nonnormal=nonnorm,
              quote = FALSE, noSpaces = TRUE, printToggle = FALSE)

Print table of anivariate associations:

kable(tbl2, format = "markdown")
No_PUR PUR p test
n 212 37
age (median [IQR]) 38.05 [29.95, 52.80] 30.00 [23.50, 42.30] 0.002 nonnorm
sex = male 131 27 0.267 exact
vitD.Rx = VitD.Rx 40 17 0.001 exact
ethnicity 0.374 exact
African 21 6
Middle.Eastern 2 0
SE.Asian 9 2
South.Asian 125 24
White.European 55 5
clinic 0.009 exact
A 92 12
B 49 13
C 50 3
D 21 9
lymphocytes (median [IQR]) 1.43 [1.10, 1.90] 1.14 [0.83, 1.51] 0.013 nonnorm
monocytes (median [IQR]) 0.60 [0.47, 0.80] 0.66 [0.40, 0.81] 0.649 nonnorm
neutrophils (median [IQR]) 4.48 [3.40, 5.93] 4.95 [3.35, 6.62] 0.696 nonnorm
haemoglobin (median [IQR]) 130.00 [116.00, 141.00] 124.00 [114.00, 137.25] 0.306 nonnorm
ESR (median [IQR]) 29.00 [13.00, 49.00] 32.50 [9.50, 61.25] 0.609 nonnorm
albumin (median [IQR]) 34.00 [30.00, 38.00] 34.00 [29.00, 37.50] 0.641 nonnorm
CRP (median [IQR]) 20.00 [4.93, 54.75] 55.00 [13.00, 72.50] 0.045 nonnorm
vitD.baseline (median [IQR]) 10.00 [7.00, 20.00] 7.00 [7.00, 10.00] 0.219 nonnorm
more1site = TRUE 76 20 0.044 exact
microscopy.diagnosis = positive 60 20 0.004 exact
culture.diagnosis = positive 125 28 0.067 exact
histology.diagnosis = positive 114 24 0.282 exact
basis.diagnosis = micro_or_histo_confirmed 186 36 0.146 exact
baseline.steroid = Yes 42 12 0.128 exact
other.immuno.drug = Yes 11 0 0.377 exact
PUR = PUR 0 37 <0.001 exact

8. Some multivariate plots

Particularly interested in two a priori suspected confounders of any relationship between vitamin D prescription and PUR: clinic site and ethnicity, so these visualised.

Plots show % of patients with PUR by vitamin D status and by the possible confounding variable. Error bars are standard error for proportion based on binomial distribution.

First, by ethnicity. Ethnicity categories were collapsed into three categories due to low frequencies for some of the categories used for primary data collection. Specifically, South East Asian and West Asian categories, which had few patients, were combined with South Asian.

dat$ethnicity <- as.character(dat$ethnicity)
dat$ethnicity[dat$ethnicity=="Middle.Eastern"] <- "Asian"
dat$ethnicity[dat$ethnicity=="SE.Asian"] <- "Asian"
dat$ethnicity[dat$ethnicity=="South.Asian"] <- "Asian"

XT <- xtabs(~ethnicity+PUR+vitD.Rx, data=dat)

# percentage PUR
percNVdAfr <- round((XT[4]/(XT[4]+XT[1]))*100, 1)
percNVdAsn <- round((XT[5]/(XT[5]+XT[2]))*100, 1)
percNVdWht <- round((XT[6]/(XT[6]+XT[3]))*100, 1)

percVdAfr <- round((XT[10]/(XT[10]+XT[7]))*100, 1)
percVdAsn <- round((XT[11]/(XT[11]+XT[8]))*100, 1)
percVdWht <- round((XT[12]/(XT[12]+XT[9]))*100, 1)

# as proportion for SE calcs
af1 <- (percNVdAfr/100)
as1 <- (percNVdAsn/100)
wh1 <- (percNVdWht/100)

af2 <- (percVdAfr/100)
as2 <- (percVdAsn/100)
wh2 <- (percVdWht/100)

# SE  = sqrt(pbar*(1-pbar)/n)
af1SE <- sqrt(af1*(1-af1)/(XT[4]+XT[1]))
as1SE <- sqrt(as1*(1-as1)/(XT[5]+XT[2]))
wh1SE <- sqrt(wh1*(1-wh1)/(XT[6]+XT[3]))

af2SE <- sqrt(af2*(1-af2)/(XT[7]+XT[10]))
as2SE <- sqrt(as2*(1-as2)/(XT[8]+XT[11]))
wh2SE <- sqrt(wh2*(1-wh2)/(XT[9]+XT[12]))

SE <- c(af1SE, as1SE, wh1SE, af2SE, as2SE, wh2SE)

Percent.with.PUR <- c(percNVdAfr, percVdAfr, percNVdAsn, percVdAsn, percNVdWht, percVdWht)

Ethnicity <- c("African", "African", "Asian", "Asian", "White.European", "White.European")

VitaminD <- rep(c("No vitamin D", "Vitamin D"), 3)

EthVit <- data.frame(Ethnicity, VitaminD, Percent.with.PUR, SE)

g <- ggplot(EthVit, aes(x=Ethnicity, y=Percent.with.PUR, fill=Ethnicity))

g  +  geom_errorbar(aes(ymin=Percent.with.PUR-(0.1),
                    ymax=Percent.with.PUR+(100*SE)),
                width=0.2, colour="black") + 
  geom_bar(stat="identity", colour="black") + facet_wrap(~VitaminD) + 
  theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1)) +
  scale_fill_brewer(palette = 2) +
  ylim(0,100) + guides(fill=FALSE) + xlab("") +
  theme(strip.text=element_text(face="bold")) +
  ggtitle("Proportion PUR by ethnicity and Vit D status") +
  theme(plot.title=element_text(size=rel(1))) +
  ylab("Percent with PUR")

Both Vitamin D prescribing behaviour and PUR incidence differed across these ethnic groupings. Patients prescribed vitamin D had higher rates of PUR irrespective of ethnic grouping.

Clinic sites :

A.vd <- sum(df$clinic=="A" & df$vitD.Rx=="VitD.Rx")
A.vd.PUR <- sum(df$clinic=="A" & df$vitD.Rx=="VitD.Rx" & df$PUR=="PUR")
A.nvd <- sum(df$clinic=="A" & df$vitD.Rx!="VitD.Rx")
A.nvd.PUR <- sum(df$clinic=="A" & df$vitD.Rx!="VitD.Rx" & df$PUR=="PUR")

A.prop.vd <- A.vd.PUR/A.vd
A.prop.nvd <- A.nvd.PUR/A.nvd
A.prop.vd.SE <- sqrt(A.prop.vd*(1-A.prop.vd)/A.vd)
A.prop.nvd.SE <- sqrt(A.prop.nvd*(1-A.prop.nvd)/A.nvd) 

#

B.vd <- sum(df$clinic=="B" & df$vitD.Rx=="VitD.Rx")
B.vd.PUR <- sum(df$clinic=="B" & df$vitD.Rx=="VitD.Rx" & df$PUR=="PUR")
B.nvd <- sum(df$clinic=="B" & df$vitD.Rx!="VitD.Rx")
B.nvd.PUR <- sum(df$clinic=="B" & df$vitD.Rx!="VitD.Rx" & df$PUR=="PUR")

B.prop.vd <- B.vd.PUR/B.vd
B.prop.nvd <- B.nvd.PUR/B.nvd
B.prop.vd.SE <- sqrt(B.prop.vd*(1-B.prop.vd)/B.vd)
B.prop.nvd.SE <- sqrt(B.prop.nvd*(1-B.prop.nvd)/B.nvd) 

#

C.vd <- sum(df$clinic=="C" & df$vitD.Rx=="VitD.Rx")
C.vd.PUR <- sum(df$clinic=="C" & df$vitD.Rx=="VitD.Rx" & df$PUR=="PUR")
C.nvd <- sum(df$clinic=="C" & df$vitD.Rx!="VitD.Rx")
C.nvd.PUR <- sum(df$clinic=="C" & df$vitD.Rx!="VitD.Rx" & df$PUR=="PUR")

C.prop.vd <- C.vd.PUR/C.vd
C.prop.nvd <- C.nvd.PUR/C.nvd
C.prop.vd.SE <- sqrt(C.prop.vd*(1-C.prop.vd)/C.vd)
C.prop.nvd.SE <- sqrt(C.prop.nvd*(1-C.prop.nvd)/C.nvd) 

#

D.vd <- sum(df$clinic=="D" & df$vitD.Rx=="VitD.Rx")
D.vd.PUR <- sum(df$clinic=="D" & df$vitD.Rx=="VitD.Rx" & df$PUR=="PUR")
D.nvd <- sum(df$clinic=="D" & df$vitD.Rx!="VitD.Rx")
D.nvd.PUR <- sum(df$clinic=="D" & df$vitD.Rx!="VitD.Rx" & df$PUR=="PUR")

D.prop.vd <- D.vd.PUR/D.vd
D.prop.nvd <- D.nvd.PUR/D.nvd
D.prop.vd.SE <- sqrt(D.prop.vd*(1-D.prop.vd)/D.vd)
D.prop.nvd.SE <- sqrt(D.prop.nvd*(1-D.prop.nvd)/D.nvd) 

props <- c(A.prop.vd, A.prop.nvd, B.prop.vd, B.prop.nvd, C.prop.vd, C.prop.nvd, D.prop.vd, D.prop.nvd)
SE <- c(A.prop.vd.SE, A.prop.nvd.SE, B.prop.vd.SE, B.prop.nvd.SE, C.prop.vd.SE, C.prop.nvd.SE, D.prop.vd.SE, D.prop.nvd.SE)
clinic <- c("A", "A", "B", "B", "C", "C", "D", "D")
VitaminD <- rep(c("Vitamin D", "No Vitamin D"), 4)

clinics <- data.frame(clinic, VitaminD, props, SE)
clinics <- dplyr::mutate(clinics, props=100*props)

g <- ggplot(clinics, aes(x=clinic, y=props, fill=clinic))

g  +  geom_errorbar(aes(ymin=props-(0.1),
                        ymax=props+(100*SE)),
                    width=0.2, colour="black") + 
  geom_bar(stat="identity", colour="black") + facet_wrap(~VitaminD) + 
  theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1)) +
  scale_fill_brewer(palette = 4) +
  ylim(0,100) + guides(fill=FALSE) + xlab("Clinic site") +
  theme(strip.text=element_text(face="bold")) +
  ggtitle("Proportion PUR by clinic and Vit D status") +
  theme(plot.title=element_text(size=rel(1))) +
  ylab("Percent with PUR")

Clinic site C is an outlier, with lower rates of PUR, lower rates of vitamin D prescription, and is the only clinic site where an increased rate of PUR is not seen for patients prescribed vitamin D. Only 11 patients at clinic C were prescribed vitamin D, and none had a PUR.

Now three way comparisons between PUR and all combinations of 2 variables from set {ethnicity, vitamin D, clinic site, age}.

g <- ggpairs(df,
             showStrips = TRUE,
             mapping = ggplot2::aes(
               color = PUR,
               alpha=0.5),
             columns = c("ethnicity","clinic", "vitD.Rx", "age"))
g + theme_grey(base_size = 8)

9. Adjusting for a priori specified potential confounders

Vitamin D remains associated with PUR after adjusting for ethnicity in a logistic regression model:

vars <- c("PUR","ethnicity","clinic","age", "sex",
          "more1site", "microscopy.diagnosis", "culture.diagnosis", "histology.diagnosis",
          "baseline.steroid", "other.immuno.drug", 
          "lymphocytes", "monocytes", "neutrophils", "haemoglobin", 
          "ESR", "albumin", "CRP","vitD.baseline","vitD.6mths", "vitD.Rx")

dat <- df[ ,vars]


# ethnicity collapsed into 3 categories
dat$ethnicity <- as.character(dat$ethnicity)
dat$ethnicity[dat$ethnicity=="Middle.Eastern"] <- "Asian"
dat$ethnicity[dat$ethnicity=="SE.Asian"] <- "Asian"
dat$ethnicity[dat$ethnicity=="South.Asian"] <- "Asian"
# vit D remains significant after adjusting for ethnicity
summary(glm(PUR~vitD.Rx+ethnicity, data=dat, family="binomial"))
## 
## Call:
## glm(formula = PUR ~ vitD.Rx + ethnicity, family = "binomial", 
##     data = dat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9602  -0.4903  -0.4903  -0.3761   2.3169  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -1.7451     0.5114  -3.412 0.000645 ***
## vitD.RxVitD.Rx            1.2100     0.3789   3.193 0.001407 ** 
## ethnicityAsian           -0.3128     0.5267  -0.594 0.552604    
## ethnicityWhite.European  -0.8683     0.6779  -1.281 0.200227    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 209.29  on 248  degrees of freedom
## Residual deviance: 195.93  on 245  degrees of freedom
## AIC: 203.93
## 
## Number of Fisher Scoring iterations: 5

Because there are 0 patients with vitamin D prescription and PUR at clinic C, a standard logistic regression model cannot be fit with clinic as a co-variable. Instead an exact logistic regression model is used here, and shows vitamin D association with PUR is overall independent of clinic site.

datClin <- dat
# collapse to binary clinic variable "clinic C" or "not clinic C"
# as C is the outlier
datClin$clinic <- as.character(datClin$clinic)
datClin$clinic[datClin$clinic!="C"] <- "nonC"

# making a collapsed data frame for exact regression with elrm package
x <- xtabs(~PUR + interaction(clinic, vitD.Rx), data = datClin)

cdat <- cdat <- data.frame(clinic = rep(1:0, 2), vitD.Rx = rep(0:1, each = 2), 
                           PUR = x[2, ], ntrials = colSums(x))

exact.m <- elrm(formula = PUR/ntrials ~ vitD.Rx + clinic, 
                interest = ~ vitD.Rx + clinic,
                iter = 5005000, dataset = cdat, burnIn = 5000, r = 2)
## Progress:   0%  Progress:   5%  Progress:  10%  Progress:  15%  Progress:  20%  Progress:  25%  Progress:  30%  Progress:  35%  Progress:  40%  Progress:  45%  Progress:  50%  Progress:  55%  Progress:  60%  Progress:  65%  Progress:  70%  Progress:  75%  Progress:  80%  Progress:  85%  Progress:  90%  Progress:  95%  Progress: 100%
summary(exact.m)
## 
## Call:
## [[1]]
## elrm(formula = PUR/ntrials ~ vitD.Rx + clinic, interest = ~vitD.Rx + 
##     clinic, r = 2, iter = 5005000, dataset = cdat, burnIn = 5000)
## 
## 
## Results:
##         estimate p-value p-value_se mc_size
## joint         NA 0.00031    0.00001 5000000
## vitD.Rx  1.34424 0.00103    0.00011   83724
## clinic  -1.54588 0.04047    0.00556    2545
## 
## 95% Confidence Intervals for Parameters
## 
##              lower       upper
## vitD.Rx  0.4707123 2.558588197
## clinic  -4.8906887 0.004469184

10. ‘Full’ logistic regression model

All variables significant at the <0.05 level included except clinic (due to zero cell problem). Ethnicity also included as important possible confounder for vit D and PUR. Because ~12% of lymphocyte count observations are missing, in a non-random way, these values are imputed using MCMC. Lymphocyte count is transformed to fit more normal distribution.

# imputation on a reduced data frame
set.seed(123)
vars <- c("PUR","ethnicity","clinic","age", "sex",
          "more1site", "microscopy.diagnosis", "vitD.Rx",
          "baseline.steroid", "other.immuno.drug", 
          "lymphocytes", "monocytes", "CRP")
dat <- df[ ,vars]
dat <- mice(dat)
## 
##  iter imp variable
##   1   1  lymphocytes  monocytes  CRP
##   1   2  lymphocytes  monocytes  CRP
##   1   3  lymphocytes  monocytes  CRP
##   1   4  lymphocytes  monocytes  CRP
##   1   5  lymphocytes  monocytes  CRP
##   2   1  lymphocytes  monocytes  CRP
##   2   2  lymphocytes  monocytes  CRP
##   2   3  lymphocytes  monocytes  CRP
##   2   4  lymphocytes  monocytes  CRP
##   2   5  lymphocytes  monocytes  CRP
##   3   1  lymphocytes  monocytes  CRP
##   3   2  lymphocytes  monocytes  CRP
##   3   3  lymphocytes  monocytes  CRP
##   3   4  lymphocytes  monocytes  CRP
##   3   5  lymphocytes  monocytes  CRP
##   4   1  lymphocytes  monocytes  CRP
##   4   2  lymphocytes  monocytes  CRP
##   4   3  lymphocytes  monocytes  CRP
##   4   4  lymphocytes  monocytes  CRP
##   4   5  lymphocytes  monocytes  CRP
##   5   1  lymphocytes  monocytes  CRP
##   5   2  lymphocytes  monocytes  CRP
##   5   3  lymphocytes  monocytes  CRP
##   5   4  lymphocytes  monocytes  CRP
##   5   5  lymphocytes  monocytes  CRP
dat <- mice::complete(dat)

dat2 <- dat # save for later propensity score matching analysis

# a model with all univariate p<0.05 
# + ethnicity as an important potential confounder
# but no 'clinic'

# ethnicity collapsed to one binary variable
dat$ethnicity <- as.character(dat$ethnicity)
dat$ethnicity[dat$ethnicity!="White.European"] <- "nonWhite.European"

full.m <- glm(PUR~vitD.Rx+I(log(lymphocytes+1))+more1site+microscopy.diagnosis+I(log(CRP+1))+ethnicity+I(log(age)),
            data=dat, family="binomial")

summary(full.m)
## 
## Call:
## glm(formula = PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + 
##     microscopy.diagnosis + I(log(CRP + 1)) + ethnicity + I(log(age)), 
##     family = "binomial", data = dat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4251  -0.5680  -0.3811  -0.2253   2.8868  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)   
## (Intercept)              3.38913    2.34359   1.446  0.14814   
## vitD.Rx2                 1.23256    0.40845   3.018  0.00255 **
## I(log(lymphocytes + 1)) -1.36939    0.76119  -1.799  0.07202 . 
## more1site2               0.50294    0.40036   1.256  0.20904   
## microscopy.diagnosis2    1.11245    0.39725   2.800  0.00510 **
## I(log(CRP + 1))          0.06174    0.16020   0.385  0.69995   
## ethnicityWhite.European -0.05363    0.60390  -0.089  0.92924   
## I(log(age))             -1.47126    0.59445  -2.475  0.01332 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 209.29  on 248  degrees of freedom
## Residual deviance: 172.78  on 241  degrees of freedom
## AIC: 188.78
## 
## Number of Fisher Scoring iterations: 5

In this ‘full’ model, age, microscopy diagnosis (AFB seen on diagnostic sample), and vitamin D prescription remain significant.

11. Stepwise logistic regression models

Next a backwards stepwise selection of variables based on Akaike Information Criteria (AIC) impact of removing the variable was performed:

step.m <- stepAIC(full.m, direction = "backward")
## Start:  AIC=188.78
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis + 
##     I(log(CRP + 1)) + ethnicity + I(log(age))
## 
##                           Df Deviance    AIC
## - ethnicity                1   172.78 186.78
## - I(log(CRP + 1))          1   172.93 186.93
## - more1site                1   174.35 188.35
## <none>                         172.78 188.78
## - I(log(lymphocytes + 1))  1   176.10 190.10
## - I(log(age))              1   179.15 193.15
## - microscopy.diagnosis     1   180.68 194.68
## - vitD.Rx                  1   181.67 195.67
## 
## Step:  AIC=186.78
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis + 
##     I(log(CRP + 1)) + I(log(age))
## 
##                           Df Deviance    AIC
## - I(log(CRP + 1))          1   172.94 184.94
## - more1site                1   174.37 186.37
## <none>                         172.78 186.78
## - I(log(lymphocytes + 1))  1   176.11 188.11
## - microscopy.diagnosis     1   180.69 192.69
## - I(log(age))              1   181.00 193.00
## - vitD.Rx                  1   181.72 193.72
## 
## Step:  AIC=184.94
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis + 
##     I(log(age))
## 
##                           Df Deviance    AIC
## - more1site                1   174.69 184.69
## <none>                         172.94 184.94
## - I(log(lymphocytes + 1))  1   177.28 187.28
## - microscopy.diagnosis     1   180.83 190.83
## - I(log(age))              1   181.35 191.35
## - vitD.Rx                  1   181.98 191.98
## 
## Step:  AIC=184.69
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + microscopy.diagnosis + 
##     I(log(age))
## 
##                           Df Deviance    AIC
## <none>                         174.69 184.69
## - I(log(lymphocytes + 1))  1   180.08 188.08
## - microscopy.diagnosis     1   182.54 190.54
## - I(log(age))              1   183.61 191.61
## - vitD.Rx                  1   184.70 192.70
summary(step.m)
## 
## Call:
## glm(formula = PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + microscopy.diagnosis + 
##     I(log(age)), family = "binomial", data = dat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3473  -0.5534  -0.3772  -0.2367   2.8400  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)   
## (Intercept)               4.1887     2.0252   2.068  0.03861 * 
## vitD.Rx2                  1.2901     0.4033   3.199  0.00138 **
## I(log(lymphocytes + 1))  -1.6228     0.7118  -2.280  0.02262 * 
## microscopy.diagnosis2     1.0974     0.3929   2.793  0.00522 **
## I(log(age))              -1.5270     0.5324  -2.868  0.00413 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 209.29  on 248  degrees of freedom
## Residual deviance: 174.69  on 244  degrees of freedom
## AIC: 184.69
## 
## Number of Fisher Scoring iterations: 5

In this model, age, microscopy diagnosis (AFB seen on diagnostic sample), vitamin D prescription, and lymphocyte count are significant.

This reduced model comapres favorably to the ‘full’ model. It has similar residual deviance and predictive performance in the data set, and is non-inferior in a -2 log likelihood test (with better AIC score):

PseudoR2(step.m)["Nagelkerke"]
## Nagelkerke 
##  0.2282075
PseudoR2(full.m)["Nagelkerke"]
## Nagelkerke 
##  0.2399237
PseudoR2(step.m)["AIC"]
##      AIC 
## 184.6892
PseudoR2(full.m)["AIC"]
##      AIC 
## 188.7761
step.m.p <- predict(step.m, type = "response")
full.m.p <- predict(full.m, type = "response")

roc(dat$PUR, step.m.p, data=dat)$auc
## Area under the curve: 0.7743
roc(dat$PUR, full.m.p, data=dat)$auc
## Area under the curve: 0.7815
anova(full.m, step.m, test ="Chisq")
## Analysis of Deviance Table
## 
## Model 1: PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis + 
##     I(log(CRP + 1)) + ethnicity + I(log(age))
## Model 2: PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + microscopy.diagnosis + 
##     I(log(age))
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       241     172.78                     
## 2       244     174.69 -3  -1.9131   0.5906

An automated search for an improved model including all possible two way interactions between age, microscopy diagnosis (AFB seen on diagnostic sample), vitamin D prescription, and lymphocyte count was performed. The best model including interaction terms was not significantly better than the previous stepwise model.

search <- step(step.m, ~.^2)
## Start:  AIC=184.69
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + microscopy.diagnosis + 
##     I(log(age))
## 
##                                                Df Deviance    AIC
## + I(log(lymphocytes + 1)):microscopy.diagnosis  1   172.56 184.56
## <none>                                              174.69 184.69
## + microscopy.diagnosis:I(log(age))              1   173.73 185.73
## + vitD.Rx:microscopy.diagnosis                  1   173.90 185.90
## + vitD.Rx:I(log(age))                           1   174.54 186.54
## + I(log(lymphocytes + 1)):I(log(age))           1   174.57 186.57
## + vitD.Rx:I(log(lymphocytes + 1))               1   174.68 186.68
## - I(log(lymphocytes + 1))                       1   180.08 188.08
## - microscopy.diagnosis                          1   182.54 190.54
## - I(log(age))                                   1   183.61 191.61
## - vitD.Rx                                       1   184.70 192.70
## 
## Step:  AIC=184.56
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + microscopy.diagnosis + 
##     I(log(age)) + I(log(lymphocytes + 1)):microscopy.diagnosis
## 
##                                                Df Deviance    AIC
## <none>                                              172.56 184.56
## - I(log(lymphocytes + 1)):microscopy.diagnosis  1   174.69 184.69
## + vitD.Rx:microscopy.diagnosis                  1   171.57 185.57
## + microscopy.diagnosis:I(log(age))              1   171.95 185.95
## + I(log(lymphocytes + 1)):I(log(age))           1   172.12 186.12
## + vitD.Rx:I(log(age))                           1   172.49 186.49
## + vitD.Rx:I(log(lymphocytes + 1))               1   172.54 186.54
## - I(log(age))                                   1   182.02 192.02
## - vitD.Rx                                       1   182.35 192.35
anova(search)
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: PUR
## 
## Terms added sequentially (first to last)
## 
## 
##                                              Df Deviance Resid. Df
## NULL                                                           248
## vitD.Rx                                       1  11.5120       247
## I(log(lymphocytes + 1))                       1   6.1153       246
## microscopy.diagnosis                          1   8.0510       245
## I(log(age))                                   1   8.9235       244
## I(log(lymphocytes + 1)):microscopy.diagnosis  1   2.1312       243
##                                              Resid. Dev
## NULL                                             209.29
## vitD.Rx                                          197.78
## I(log(lymphocytes + 1))                          191.66
## microscopy.diagnosis                             183.61
## I(log(age))                                      174.69
## I(log(lymphocytes + 1)):microscopy.diagnosis     172.56

12. Overly Influential cases

Using the stepwise model without interaction terms, cases with disproportionate influence were identified (defined as Cook’s distance > 4 standard deviations). These 7 cases were removed and full and stepwise models were rerun to give a final model.

summary(cooks.distance(step.m))
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000094 0.0000883 0.0003998 0.0042930 0.0024280 0.0666300
cutoff <- sd(cooks.distance(step.m))*4
plot(cooks.distance(step.m))
abline(h=cutoff, col="darkgreen")

dat.reduced <- dat[cooks.distance(step.m)<cutoff,]

cooksAdj.full.m <- glm(PUR~vitD.Rx+I(log(lymphocytes+1))+more1site+microscopy.diagnosis+I(log(CRP+1))+ethnicity+I(log(age)),
              data=dat.reduced, family="binomial")

cooksAdj.step.m <- stepAIC(cooksAdj.full.m, direction = "backward")
## Start:  AIC=151.37
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis + 
##     I(log(CRP + 1)) + ethnicity + I(log(age))
## 
##                           Df Deviance    AIC
## - ethnicity                1   135.43 149.43
## - I(log(CRP + 1))          1   135.45 149.45
## <none>                         135.37 151.37
## - more1site                1   138.53 152.53
## - I(log(lymphocytes + 1))  1   140.89 154.89
## - vitD.Rx                  1   141.66 155.66
## - microscopy.diagnosis     1   144.31 158.31
## - I(log(age))              1   145.92 159.92
## 
## Step:  AIC=149.43
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis + 
##     I(log(CRP + 1)) + I(log(age))
## 
##                           Df Deviance    AIC
## - I(log(CRP + 1))          1   135.53 147.53
## <none>                         135.43 149.43
## - more1site                1   138.63 150.63
## - I(log(lymphocytes + 1))  1   140.96 152.96
## - vitD.Rx                  1   141.77 153.77
## - microscopy.diagnosis     1   144.31 156.31
## - I(log(age))              1   149.09 161.09
## 
## Step:  AIC=147.53
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis + 
##     I(log(age))
## 
##                           Df Deviance    AIC
## <none>                         135.53 147.53
## - more1site                1   138.94 148.94
## - vitD.Rx                  1   141.91 151.91
## - I(log(lymphocytes + 1))  1   142.33 152.33
## - microscopy.diagnosis     1   144.36 154.36
## - I(log(age))              1   149.50 159.50
final.m <- cooksAdj.step.m

13. Description final model

summary(final.m)
## 
## Call:
## glm(formula = PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + 
##     microscopy.diagnosis + I(log(age)), family = "binomial", 
##     data = dat.reduced)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5126  -0.4609  -0.2795  -0.1544   2.5752  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               6.7808     2.6035   2.605 0.009200 ** 
## vitD.Rx2                  1.1874     0.4646   2.556 0.010588 *  
## I(log(lymphocytes + 1))  -2.2648     0.9027  -2.509 0.012108 *  
## more1site2                0.8309     0.4533   1.833 0.066827 .  
## microscopy.diagnosis2     1.3175     0.4494   2.932 0.003368 ** 
## I(log(age))              -2.3408     0.6793  -3.446 0.000569 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 181.38  on 241  degrees of freedom
## Residual deviance: 135.53  on 236  degrees of freedom
## AIC: 147.53
## 
## Number of Fisher Scoring iterations: 6
with(final.m, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 9.716045e-09
final.m.p <- predict(final.m, type = "response")
roc(dat.reduced$PUR, final.m.p, data=dat.reduced)$auc
## Area under the curve: 0.8437
PseudoR2(final.m)["Nagelkerke"]
## Nagelkerke 
##  0.3272972
PseudoR2(final.m)["AIC"]
##      AIC 
## 147.5254

14. Propensity score matching analysis

As seen above clinic site was related to both probability of vitamin D prescribing and probability of PUR occurance (hence a significant risk is a confounder), but could not be included in the final logistic regression model due to zero cell occurence. As an additional check that results are robust despite this problem, here a propensity score analysis is performed where cases are matched on propensity for vitamin D to be prescribed, using covariates that were related to both vitamin D prescription and PUR outcome. Using the (smaller) matched dataset an assocation between vitamin D prescribing and PUR is still found.

### *** uses a earlier version of dat before ethnicity collapsed to binary

# how much do the co-factors influence vitamin D Rx propensity / probability?
vitDRx.m <- glm(vitD.Rx~age+ethnicity+clinic, data=dat2,
                family="binomial")
summary(vitDRx.m)
## 
## Call:
## glm(formula = vitD.Rx ~ age + ethnicity + clinic, family = "binomial", 
##     data = dat2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5549  -0.6141  -0.4020  -0.1956   2.2777  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.64112    0.77109  -3.425 0.000614 ***
## age         -0.00149    0.01308  -0.114 0.909280    
## ethnicity2   1.84059    1.62751   1.131 0.258089    
## ethnicity3  -1.36501    1.18718  -1.150 0.250228    
## ethnicity4   0.21478    0.50573   0.425 0.671056    
## ethnicity5  -1.19302    0.66990  -1.781 0.074929 .  
## clinic2      2.33300    0.48413   4.819 1.44e-06 ***
## clinic3      1.72506    0.55254   3.122 0.001796 ** 
## clinic4      2.94495    0.57766   5.098 3.43e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 267.91  on 248  degrees of freedom
## Residual deviance: 216.06  on 240  degrees of freedom
## AIC: 234.06
## 
## Number of Fisher Scoring iterations: 5
library(BaylorEdPsych)
PseudoR2(vitDRx.m)["Nagelkerke"]
## Nagelkerke 
##  0.2852224
vitDRx.m.p <- predict(vitDRx.m, type = "response")
library(pROC)
roc(dat2$vitD.Rx, vitDRx.m.p, data=dat2)$auc
## Area under the curve: 0.7908
### propensity score matching with MatchIt 
library(MatchIt)
# grouping variable is dat$vitD.Rx
# matching variables chosen : age, clinic, ethnicity

# needs binary or boolean grouping variable
dat2$vitD.Rx <- as.character(dat2$vitD.Rx)
dat2$vitD.Rx[dat2$vitD.Rx=="Nil.vitD.Rx"] <- 0
dat2$vitD.Rx[dat2$vitD.Rx=="VitD.Rx"] <- 1
dat2$vitD.Rx <- as.numeric(dat2$vitD.Rx)

#matching treatments:controls in 1:1 ratio
m.out <- matchit(vitD.Rx ~ age + clinic + ethnicity,
                data = dat2, method = "nearest", ratio = 1) 
summary(m.out) 
## 
## Call:
## matchit(formula = vitD.Rx ~ age + clinic + ethnicity, data = dat2, 
##     method = "nearest", ratio = 1)
## 
## Summary of balance for all data:
##            Means Treated Means Control SD Control Mean Diff eQQ Med
## distance          0.3856        0.1824     0.1708    0.2032  0.2024
## age              37.3158       41.9333    16.6348   -4.6175  4.7000
## clinicA           0.1228        0.5052     0.5013   -0.3824  0.0000
## clinicB           0.4211        0.1979     0.3995    0.2231  0.0000
## clinicC           0.1930        0.2188     0.4145   -0.0258  0.0000
## clinicD           0.2632        0.0781     0.2691    0.1850  0.0000
## ethnicity2        0.0175        0.0052     0.0722    0.0123  0.0000
## ethnicity3        0.0175        0.0521     0.2228   -0.0345  0.0000
## ethnicity4        0.6842        0.5729     0.4959    0.1113  0.0000
## ethnicity5        0.1228        0.2760     0.4482   -0.1532  0.0000
##            eQQ Mean eQQ Max
## distance     0.2056  0.3775
## age          4.8105 13.5000
## clinicA      0.3860  1.0000
## clinicB      0.2281  1.0000
## clinicC      0.0351  1.0000
## clinicD      0.1754  1.0000
## ethnicity2   0.0000  0.0000
## ethnicity3   0.0351  1.0000
## ethnicity4   0.1228  1.0000
## ethnicity5   0.1579  1.0000
## 
## 
## Summary of balance for matched data:
##            Means Treated Means Control SD Control Mean Diff eQQ Med
## distance          0.3856        0.3750     0.1688    0.0106  0.0013
## age              37.3158       37.0596    13.8588    0.2561  1.4000
## clinicA           0.1228        0.1053     0.3096    0.0175  0.0000
## clinicB           0.4211        0.4386     0.5006   -0.0175  0.0000
## clinicC           0.1930        0.2456     0.4343   -0.0526  0.0000
## clinicD           0.2632        0.2105     0.4113    0.0526  0.0000
## ethnicity2        0.0175        0.0000     0.0000    0.0175  0.0000
## ethnicity3        0.0175        0.0175     0.1325    0.0000  0.0000
## ethnicity4        0.6842        0.7368     0.4443   -0.0526  0.0000
## ethnicity5        0.1228        0.1228     0.3311    0.0000  0.0000
##            eQQ Mean eQQ Max
## distance     0.0111   0.096
## age          2.0737   7.500
## clinicA      0.0175   1.000
## clinicB      0.0175   1.000
## clinicC      0.0526   1.000
## clinicD      0.0526   1.000
## ethnicity2   0.0175   1.000
## ethnicity3   0.0000   0.000
## ethnicity4   0.0526   1.000
## ethnicity5   0.0000   0.000
## 
## Percent Balance Improvement:
##            Mean Diff. eQQ Med eQQ Mean  eQQ Max
## distance      94.7811 99.3405  94.6230  74.5751
## age           94.4529 70.2128  56.8928  44.4444
## clinicA       95.4122  0.0000  95.4545   0.0000
## clinicB       92.1376  0.0000  92.3077   0.0000
## clinicC     -104.2553  0.0000 -50.0000   0.0000
## clinicD       71.5556  0.0000  70.0000   0.0000
## ethnicity2   -42.2222  0.0000     -Inf     -Inf
## ethnicity3   100.0000  0.0000 100.0000 100.0000
## ethnicity4    52.7094  0.0000  57.1429   0.0000
## ethnicity5   100.0000  0.0000 100.0000 100.0000
## 
## Sample sizes:
##           Control Treated
## All           192      57
## Matched        57      57
## Unmatched     135       0
## Discarded       0       0
# plot(m.out, type = "jitter")
plot(m.out, type = "hist")

# extract the matched data
m.data1 <- match.data(m.out)

# check vit D Rx still associated with PUR after matching
matched.m <- glm(PUR~vitD.Rx, data=m.data1, family="binomial")
summary(matched.m)
## 
## Call:
## glm(formula = PUR ~ vitD.Rx, family = "binomial", data = m.data1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8416  -0.8416  -0.5500  -0.5500   1.9817  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.8124     0.3813  -4.753 2.01e-06 ***
## vitD.Rx       0.9567     0.4788   1.998   0.0457 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 119.93  on 113  degrees of freedom
## Residual deviance: 115.71  on 112  degrees of freedom
## AIC: 119.71
## 
## Number of Fisher Scoring iterations: 4
exp(confint(matched.m))
##                  2.5 %    97.5 %
## (Intercept) 0.07150922 0.3254047
## vitD.Rx     1.04469433 6.9629099
exp(coef(matched.m))
## (Intercept)     vitD.Rx 
##   0.1632653   2.6031250
#matching treatments:controls in 1:2 ratio
m.out <- matchit(vitD.Rx ~ age + clinic + ethnicity,
                data = dat2, method = "nearest", ratio = 2) 
summary(m.out) 
## 
## Call:
## matchit(formula = vitD.Rx ~ age + clinic + ethnicity, data = dat2, 
##     method = "nearest", ratio = 2)
## 
## Summary of balance for all data:
##            Means Treated Means Control SD Control Mean Diff eQQ Med
## distance          0.3856        0.1824     0.1708    0.2032  0.2024
## age              37.3158       41.9333    16.6348   -4.6175  4.7000
## clinicA           0.1228        0.5052     0.5013   -0.3824  0.0000
## clinicB           0.4211        0.1979     0.3995    0.2231  0.0000
## clinicC           0.1930        0.2188     0.4145   -0.0258  0.0000
## clinicD           0.2632        0.0781     0.2691    0.1850  0.0000
## ethnicity2        0.0175        0.0052     0.0722    0.0123  0.0000
## ethnicity3        0.0175        0.0521     0.2228   -0.0345  0.0000
## ethnicity4        0.6842        0.5729     0.4959    0.1113  0.0000
## ethnicity5        0.1228        0.2760     0.4482   -0.1532  0.0000
##            eQQ Mean eQQ Max
## distance     0.2056  0.3775
## age          4.8105 13.5000
## clinicA      0.3860  1.0000
## clinicB      0.2281  1.0000
## clinicC      0.0351  1.0000
## clinicD      0.1754  1.0000
## ethnicity2   0.0000  0.0000
## ethnicity3   0.0351  1.0000
## ethnicity4   0.1228  1.0000
## ethnicity5   0.1579  1.0000
## 
## 
## Summary of balance for matched data:
##            Means Treated Means Control SD Control Mean Diff eQQ Med
## distance          0.3856        0.2654     0.1782    0.1202  0.1427
## age              37.3158       42.1219    17.5984   -4.8061  5.8000
## clinicA           0.1228        0.1667     0.3743   -0.0439  0.0000
## clinicB           0.4211        0.3333     0.4735    0.0877  0.0000
## clinicC           0.1930        0.3684     0.4845   -0.1754  0.0000
## clinicD           0.2632        0.1316     0.3395    0.1316  0.0000
## ethnicity2        0.0175        0.0088     0.0937    0.0088  0.0000
## ethnicity3        0.0175        0.0526     0.2243   -0.0351  0.0000
## ethnicity4        0.6842        0.5088     0.5021    0.1754  0.0000
## ethnicity5        0.1228        0.3246     0.4703   -0.2018  0.0000
##            eQQ Mean eQQ Max
## distance     0.1236  0.2842
## age          5.3386 12.2000
## clinicA      0.0351  1.0000
## clinicB      0.0877  1.0000
## clinicC      0.1754  1.0000
## clinicD      0.1404  1.0000
## ethnicity2   0.0000  0.0000
## ethnicity3   0.0351  1.0000
## ethnicity4   0.1754  1.0000
## ethnicity5   0.1930  1.0000
## 
## Percent Balance Improvement:
##            Mean Diff.  eQQ Med  eQQ Mean eQQ Max
## distance      40.8360  29.5173   39.8790 24.7025
## age           -4.0843 -23.4043  -10.9774  9.6296
## clinicA       88.5305   0.0000   90.9091  0.0000
## clinicB       60.6880   0.0000   61.5385  0.0000
## clinicC     -580.8511   0.0000 -400.0000  0.0000
## clinicD       28.8889   0.0000   20.0000  0.0000
## ethnicity2    28.8889   0.0000    0.0000  0.0000
## ethnicity3    -1.5873   0.0000    0.0000  0.0000
## ethnicity4   -57.6355   0.0000  -42.8571  0.0000
## ethnicity5   -31.6637   0.0000  -22.2222  0.0000
## 
## Sample sizes:
##           Control Treated
## All           192      57
## Matched       114      57
## Unmatched      78       0
## Discarded       0       0
# plot(m.out, type = "jitter")
plot(m.out, type = "hist")

# extract the matched data
m.data1 <- match.data(m.out)

# check vit D Rx still associated with PUR after matching
matched.m <- glm(PUR~vitD.Rx, data=m.data1, family="binomial")
summary(matched.m)
## 
## Call:
## glm(formula = PUR ~ vitD.Rx, family = "binomial", data = m.data1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8416  -0.5312  -0.5312  -0.5312   2.0140  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.8871     0.2771  -6.811  9.7e-12 ***
## vitD.Rx       1.0314     0.4007   2.574   0.0101 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 164.86  on 170  degrees of freedom
## Residual deviance: 158.25  on 169  degrees of freedom
## AIC: 162.25
## 
## Number of Fisher Scoring iterations: 4
exp(confint(matched.m))
##                 2.5 %    97.5 %
## (Intercept) 0.0845356 0.2524428
## vitD.Rx     1.2801606 6.2206071
exp(coef(matched.m))
## (Intercept)     vitD.Rx 
##   0.1515152   2.8050000