EPTB_PUR

1. Required packages

#packages
library(RCurl)
library(knitr)
library(elrm)
library(psych)
library(survival)
library(mice)
library(VIM)
library(MASS)
library(BaylorEdPsych)
library(pROC)
library(GGally)
library(tableone)
library(dplyr)
library(ggplot2)

2. Load data and create derived variables

# data read in
url <- "https://raw.githubusercontent.com/davidadambarr/EPTB_PUR/master/PURTB_raw.csv"
x <- getURL(url)
df <- read.csv(text = x)

### coerce data types and clean data
#dates
format <- "%d/%m/%Y"
df$PUR.date <- as.Date(as.character(df$PUR.date),format)
df$Date.TB.Rx.start <- as.Date(as.character(df$Date.TB.Rx.start),format)
df$Date.TB.Rx.finish <- as.Date(as.character(df$Date.TB.Rx.finish),format)
df$VitD.Rx.Date <- as.Date(as.character(df$VitD.Rx.Date),format)

# below limit of detection values
df$CRP <- as.character(df$CRP)
df$CRP[df$CRP=="<0.2"] <- "0.2"
df$CRP[df$CRP=="<3"] <- "3"
df$CRP <- as.numeric(df$CRP)

### create 'timing' variables
df$PUR.time <- as.numeric(df$PUR.date - df$Date.TB.Rx.start)

### create a 'number of sites' variable
df <- dplyr::mutate(df, NumSites=pleural+internal.LN+external.LN+CNS+BJI+pericardial+abdominal+other.site)
df <- dplyr::mutate(df, more1site=NumSites>1|miliary==1)
df$more1site <- as.factor(df$more1site)

3. Summary of data by Vit D prescription status (table 1)

### Table 1
df <- dplyr::select(df, -number)

vars <- c("age", "sex", "PUR", "vitD.Rx", "ethnicity", "clinic",
          "lymphocytes","monocytes","neutrophils","haemoglobin","ESR","albumin","CRP",
          "vitD.baseline","vitD.6mths","vitD.12mths",
          "pleural","internal.LN","external.LN","CNS", "BJI","pericardial","abdominal","miliary","other.site",
          "microscopy.diagnosis","culture.diagnosis","histology.diagnosis", "basis.diagnosis", "more1site",
          "baseline.steroid","other.immuno.drug",
          "PUR.Rx","ADR","prior.hypercalcaemia","prior.hypocalcaemia","calcaemia_during.TBRx")

# numeric site variables to factors
siteVars <- c("pleural","internal.LN","external.LN","CNS", "BJI","pericardial","abdominal","miliary","other.site")
df[ ,siteVars] <- lapply(df[ ,siteVars], as.factor)

catVars <- sapply(df, is.factor)
numVars <- sapply(df, is.numeric)
catVars <- names(df)[catVars]
numVars <- names(df)[numVars]


# nonparametric and exact tests for p values

# overall cohort
tbl1.2 <- tableone::CreateTableOne(data=df, vars=vars)
tbl1.2 <- print(tbl1.2, showAllLevels = FALSE, format = "f", 
              nonnormal=numVars,
              quote = FALSE, noSpaces = TRUE, printToggle = FALSE)

# stratified by vit D prescription status
tbl1 <- tableone::CreateTableOne(data=df, vars=vars, strata = "vitD.Rx")
tbl1 <- print(tbl1, showAllLevels = FALSE, format = "f", 
              exact=catVars, nonnormal=numVars,
              quote = FALSE, noSpaces = TRUE, printToggle = FALSE)

Printing tables. First is the overall cohort description:

kable(tbl1.2, format = "markdown")

	Overall
n	249
age (median [IQR])	36.00 [28.90, 50.70]
sex = male	158
PUR = PUR	37
vitD.Rx = VitD.Rx	57
ethnicity
African	27
Middle.Eastern	2
SE.Asian	11
South.Asian	149
White.European	60
clinic
A	104
B	62
C	53
D	30
lymphocytes (median [IQR])	1.40 [1.02, 1.90]
monocytes (median [IQR])	0.60 [0.43, 0.80]
neutrophils (median [IQR])	4.50 [3.40, 6.08]
haemoglobin (median [IQR])	130.00 [115.00, 140.00]
ESR (median [IQR])	29.00 [13.00, 51.00]
albumin (median [IQR])	34.00 [30.00, 38.00]
CRP (median [IQR])	23.00 [5.00, 65.00]
vitD.baseline (median [IQR])	10.00 [7.00, 19.00]
vitD.6mths (median [IQR])	26.00 [13.00, 44.75]
vitD.12mths (median [IQR])	7.00 [7.00, 17.75]
pleural = 1	48
internal.LN = 1	134
external.LN = 1	96
CNS = 1	6
BJI = 1	36
pericardial = 1	9
abdominal = 1	47
miliary = 1	2
other.site = 1	22
microscopy.diagnosis = positive	80
culture.diagnosis = positive	153
histology.diagnosis = positive	138
basis.diagnosis = micro_or_histo_confirmed	222
more1site = TRUE	96
baseline.steroid = Yes	54
other.immuno.drug = Yes	11
PUR.Rx
Extended.course.TB.therapy	5
None	12
NSAID	2
Oral.Steroid	11
Percutaneous.Aspiration	2
Percutaneous.Aspiration.Oral.Steroid	3
Surgery	1
ADR = No_recorded_ADR	182
prior.hypercalcaemia = Prior_hypercalcaemia	15
prior.hypocalcaemia = Prior_hypocalcaemia	7
calcaemia_during.TBRx
hypercalcaemic	10
hypocalcaemia	4
normocalcaemic	192

Second, whole cohort description stratified by vitamin D prescription status:

kable(tbl1, format = "markdown")

	Nil.vitD.Rx	VitD.Rx	p	test
n	192	57
age (median [IQR])	38.65 [29.48, 52.33]	32.70 [26.80, 43.40]	0.055	nonnorm
sex = male	123	35	0.755	exact
PUR = PUR	20	17	0.001	exact
vitD.Rx = VitD.Rx	0	57	<0.001	exact
ethnicity			0.039	exact
African	18	9
Middle.Eastern	1	1
SE.Asian	10	1
South.Asian	110	39
White.European	53	7
clinic			<0.001	exact
A	97	7
B	38	24
C	42	11
D	15	15
lymphocytes (median [IQR])	1.40 [1.04, 1.90]	1.34 [0.97, 1.82]	0.592	nonnorm
monocytes (median [IQR])	0.61 [0.50, 0.80]	0.56 [0.40, 0.80]	0.177	nonnorm
neutrophils (median [IQR])	4.55 [3.40, 5.90]	4.41 [3.38, 6.78]	0.665	nonnorm
haemoglobin (median [IQR])	130.00 [116.00, 141.00]	125.50 [111.00, 137.00]	0.223	nonnorm
ESR (median [IQR])	33.00 [13.00, 50.00]	26.50 [10.00, 61.25]	0.944	nonnorm
albumin (median [IQR])	35.00 [30.00, 38.00]	33.00 [28.00, 37.00]	0.131	nonnorm
CRP (median [IQR])	21.00 [4.60, 64.75]	29.00 [6.00, 65.00]	0.505	nonnorm
vitD.baseline (median [IQR])	16.00 [7.00, 26.00]	7.00 [7.00, 10.00]	0.004	nonnorm
vitD.6mths (median [IQR])	24.00 [18.75, 28.50]	26.00 [9.25, 58.50]	0.846	nonnorm
vitD.12mths (median [IQR])	8.50 [7.00, 17.75]	7.00 [7.00, 15.75]	0.817	nonnorm
pleural = 1	39	9	0.567	exact
internal.LN = 1	103	31	1.000	exact
external.LN = 1	70	26	0.219	exact
CNS = 1	4	2	0.623	exact
BJI = 1	21	15	0.009	exact
pericardial = 1	6	3	0.432	exact
abdominal = 1	34	13	0.441	exact
miliary = 1	1	1	0.406	exact
other.site = 1	17	5	1.000	exact
microscopy.diagnosis = positive	62	18	1.000	exact
culture.diagnosis = positive	111	42	0.031	exact
histology.diagnosis = positive	103	35	0.363	exact
basis.diagnosis = micro_or_histo_confirmed	167	55	0.051	exact
more1site = TRUE	70	26	0.219	exact
baseline.steroid = Yes	42	12	1.000	exact
other.immuno.drug = Yes	10	1	0.465	exact
PUR.Rx			0.058	exact
Extended.course.TB.therapy	5	0
None	6	6
NSAID	0	2
Oral.Steroid	4	7
Percutaneous.Aspiration	2	0
Percutaneous.Aspiration.Oral.Steroid	1	2
Surgery	1	0
ADR = No_recorded_ADR	140	42	0.864	exact
prior.hypercalcaemia = Prior_hypercalcaemia	14	1	0.203	exact
prior.hypocalcaemia = Prior_hypocalcaemia	4	3	0.362	exact
calcaemia_during.TBRx			0.879	exact
hypercalcaemic	7	3
hypocalcaemia	3	1
normocalcaemic	144	48

Note that there are several potential confounders including clinic site and ethnicity as predicted a priori.

4. PUR descrition

Distribution of timing first PUR event in those patients who experienced a PUR.

# timing
summary(df$PUR.time)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    10.0    27.0    52.5   100.4   167.8   504.0     213

ggplot(df, aes(x=PUR.time)) + geom_histogram() + ggtitle("Timing of first PUR, days from start TB Rx")

ggplot(df, aes(x=log(PUR.time))) + geom_histogram(bins=10)+
  ggtitle("Timing of first PUR, log(days)")

time <- df$PUR.time
event <- as.numeric(df$PUR=="PUR")
# looking at PUR positive patients only:
kmsurvival <- survfit(Surv(time,event) ~ 1)
plot(kmsurvival, xlab="Time in days from start RHZE",ylab="Proportion no PUR")
title(main = "Timing of PURs")

Treatment modalities used for PUR cases.

# treatment of PUR
table(df$PUR.Rx)

## 
##           Extended.course.TB.therapy                                 None 
##                                    5                                   12 
##                                NSAID                         Oral.Steroid 
##                                    2                                   11 
##              Percutaneous.Aspiration Percutaneous.Aspiration.Oral.Steroid 
##                                    2                                    3 
##                              Surgery 
##                                    1

5. Distribution of numerical variables

Several of the distributions look more normal after transformation. For example, lymphocyte count:

par(mfrow=c(2,2))
hist(df$lymphocytes, main = "Lymphocyte count histogram")
qqnorm(df$lymphocytes, main = "Lymphocyte count QQ plot")
hist(log(df$lymphocytes+1), main = "After transformation histogram")
qqnorm(log(df$lymphocytes+1), main = "After transformation QQ plot")

par(mfrow=c(1,1))

6. Missing data

Here, proportion of observations missing is plotted by variable:

vars <- c("PUR","ethnicity","clinic","age", "sex",
          "more1site",
          "microscopy.diagnosis", "culture.diagnosis", "histology.diagnosis",
          "baseline.steroid", "other.immuno.drug", 
          "lymphocytes", "monocytes", "neutrophils", "haemoglobin", 
          "ESR", "albumin", "CRP","vitD.baseline","vitD.6mths", "vitD.Rx")
dat <- df[ ,vars]

# check for missingness by variable
missing <- data.frame(colSums(is.na(dat)))
vars <- rownames(missing)
missing <- cbind(missing, vars)
n <- nrow(dat)
names(missing) <- c("n_missing", "vars")
missing$vars <- as.character(missing$vars)
missing <- dplyr::mutate(missing, prop_missing=n_missing/n)
p2 <- ggplot(missing, aes(x=reorder(vars, -prop_missing), y=prop_missing))
p2 + geom_bar(stat="identity") + theme(axis.text.x=element_text(angle=90, hjust=1)) +
  ggtitle("Proportion missing by variable")

Here, all the data is visualised by variable (columns) and by patient (rows). Higher values are a darker greyscale shade, absence of a binary variable is white and presence of a binary variable is black. Missing observations are coloured red.

mar.default <- par("mar")
cex.default <- par("cex")
par(mar=c(12, 4.1,4.1,2.1), cex=0.7)
matrixplot(dat)

par(mar=c(mar.default))  
par(cex=c(cex.default))

**Note Vit D serum levels and ESR are majority incomplete so can’t easily be used in later regression modeling. Other baseline blood results are about 10-12% missing and they appear to be missing in non-random way (e.g. some clinic sites more complete than others. This means deletion of cases with missing observations may introduce bias in regression modelling and suggests imputation of missing observations is more appropriate fix.

7. Univariate associations with PUR

vars <- c("age", "sex", "vitD.Rx", "ethnicity", "clinic",
          "lymphocytes","monocytes","neutrophils","haemoglobin","ESR","albumin","CRP",
          "vitD.baseline", "more1site",
          "microscopy.diagnosis","culture.diagnosis","histology.diagnosis", "basis.diagnosis",
          "baseline.steroid","other.immuno.drug","PUR")

nonnorm <- c("lymphocytes", "monocytes", "neutrophils", "haemoglobin", "ESR", 
             "albumin", "CRP", "age", "vitD.baseline")
catVars <- sapply(df, is.factor)
numVars <- sapply(df, is.numeric)
catVars <- names(df)[catVars]
numVars <- names(df)[numVars]

catVars <- dplyr::intersect(catVars, vars)

tbl2 <- tableone::CreateTableOne(data=df, vars=vars, strata = "PUR")

tbl2 <- print(tbl2, showAllLevels = FALSE, format = "f", 
              exact=catVars, nonnormal=nonnorm,
              quote = FALSE, noSpaces = TRUE, printToggle = FALSE)

Print table of anivariate associations:

kable(tbl2, format = "markdown")

	No_PUR	PUR	p	test
n	212	37
age (median [IQR])	38.05 [29.95, 52.80]	30.00 [23.50, 42.30]	0.002	nonnorm
sex = male	131	27	0.267	exact
vitD.Rx = VitD.Rx	40	17	0.001	exact
ethnicity			0.374	exact
African	21	6
Middle.Eastern	2	0
SE.Asian	9	2
South.Asian	125	24
White.European	55	5
clinic			0.009	exact
A	92	12
B	49	13
C	50	3
D	21	9
lymphocytes (median [IQR])	1.43 [1.10, 1.90]	1.14 [0.83, 1.51]	0.013	nonnorm
monocytes (median [IQR])	0.60 [0.47, 0.80]	0.66 [0.40, 0.81]	0.649	nonnorm
neutrophils (median [IQR])	4.48 [3.40, 5.93]	4.95 [3.35, 6.62]	0.696	nonnorm
haemoglobin (median [IQR])	130.00 [116.00, 141.00]	124.00 [114.00, 137.25]	0.306	nonnorm
ESR (median [IQR])	29.00 [13.00, 49.00]	32.50 [9.50, 61.25]	0.609	nonnorm
albumin (median [IQR])	34.00 [30.00, 38.00]	34.00 [29.00, 37.50]	0.641	nonnorm
CRP (median [IQR])	20.00 [4.93, 54.75]	55.00 [13.00, 72.50]	0.045	nonnorm
vitD.baseline (median [IQR])	10.00 [7.00, 20.00]	7.00 [7.00, 10.00]	0.219	nonnorm
more1site = TRUE	76	20	0.044	exact
microscopy.diagnosis = positive	60	20	0.004	exact
culture.diagnosis = positive	125	28	0.067	exact
histology.diagnosis = positive	114	24	0.282	exact
basis.diagnosis = micro_or_histo_confirmed	186	36	0.146	exact
baseline.steroid = Yes	42	12	0.128	exact
other.immuno.drug = Yes	11	0	0.377	exact
PUR = PUR	0	37	<0.001	exact

8. Some multivariate plots

Particularly interested in two a priori suspected confounders of any relationship between vitamin D prescription and PUR: clinic site and ethnicity, so these visualised.

Plots show % of patients with PUR by vitamin D status and by the possible confounding variable. Error bars are standard error for proportion based on binomial distribution.

First, by ethnicity. Ethnicity categories were collapsed into three categories due to low frequencies for some of the categories used for primary data collection. Specifically, South East Asian and West Asian categories, which had few patients, were combined with South Asian.

dat$ethnicity <- as.character(dat$ethnicity)
dat$ethnicity[dat$ethnicity=="Middle.Eastern"] <- "Asian"
dat$ethnicity[dat$ethnicity=="SE.Asian"] <- "Asian"
dat$ethnicity[dat$ethnicity=="South.Asian"] <- "Asian"

XT <- xtabs(~ethnicity+PUR+vitD.Rx, data=dat)

# percentage PUR
percNVdAfr <- round((XT[4]/(XT[4]+XT[1]))*100, 1)
percNVdAsn <- round((XT[5]/(XT[5]+XT[2]))*100, 1)
percNVdWht <- round((XT[6]/(XT[6]+XT[3]))*100, 1)

percVdAfr <- round((XT[10]/(XT[10]+XT[7]))*100, 1)
percVdAsn <- round((XT[11]/(XT[11]+XT[8]))*100, 1)
percVdWht <- round((XT[12]/(XT[12]+XT[9]))*100, 1)

# as proportion for SE calcs
af1 <- (percNVdAfr/100)
as1 <- (percNVdAsn/100)
wh1 <- (percNVdWht/100)

af2 <- (percVdAfr/100)
as2 <- (percVdAsn/100)
wh2 <- (percVdWht/100)

# SE  = sqrt(pbar*(1-pbar)/n)
af1SE <- sqrt(af1*(1-af1)/(XT[4]+XT[1]))
as1SE <- sqrt(as1*(1-as1)/(XT[5]+XT[2]))
wh1SE <- sqrt(wh1*(1-wh1)/(XT[6]+XT[3]))

af2SE <- sqrt(af2*(1-af2)/(XT[7]+XT[10]))
as2SE <- sqrt(as2*(1-as2)/(XT[8]+XT[11]))
wh2SE <- sqrt(wh2*(1-wh2)/(XT[9]+XT[12]))

SE <- c(af1SE, as1SE, wh1SE, af2SE, as2SE, wh2SE)

Percent.with.PUR <- c(percNVdAfr, percVdAfr, percNVdAsn, percVdAsn, percNVdWht, percVdWht)

Ethnicity <- c("African", "African", "Asian", "Asian", "White.European", "White.European")

VitaminD <- rep(c("No vitamin D", "Vitamin D"), 3)

EthVit <- data.frame(Ethnicity, VitaminD, Percent.with.PUR, SE)

g <- ggplot(EthVit, aes(x=Ethnicity, y=Percent.with.PUR, fill=Ethnicity))

g  +  geom_errorbar(aes(ymin=Percent.with.PUR-(0.1),
                    ymax=Percent.with.PUR+(100*SE)),
                width=0.2, colour="black") + 
  geom_bar(stat="identity", colour="black") + facet_wrap(~VitaminD) + 
  theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1)) +
  scale_fill_brewer(palette = 2) +
  ylim(0,100) + guides(fill=FALSE) + xlab("") +
  theme(strip.text=element_text(face="bold")) +
  ggtitle("Proportion PUR by ethnicity and Vit D status") +
  theme(plot.title=element_text(size=rel(1))) +
  ylab("Percent with PUR")

Both Vitamin D prescribing behaviour and PUR incidence differed across these ethnic groupings. Patients prescribed vitamin D had higher rates of PUR irrespective of ethnic grouping.

Clinic sites :

A.vd <- sum(df$clinic=="A" & df$vitD.Rx=="VitD.Rx")
A.vd.PUR <- sum(df$clinic=="A" & df$vitD.Rx=="VitD.Rx" & df$PUR=="PUR")
A.nvd <- sum(df$clinic=="A" & df$vitD.Rx!="VitD.Rx")
A.nvd.PUR <- sum(df$clinic=="A" & df$vitD.Rx!="VitD.Rx" & df$PUR=="PUR")

A.prop.vd <- A.vd.PUR/A.vd
A.prop.nvd <- A.nvd.PUR/A.nvd
A.prop.vd.SE <- sqrt(A.prop.vd*(1-A.prop.vd)/A.vd)
A.prop.nvd.SE <- sqrt(A.prop.nvd*(1-A.prop.nvd)/A.nvd) 

#

B.vd <- sum(df$clinic=="B" & df$vitD.Rx=="VitD.Rx")
B.vd.PUR <- sum(df$clinic=="B" & df$vitD.Rx=="VitD.Rx" & df$PUR=="PUR")
B.nvd <- sum(df$clinic=="B" & df$vitD.Rx!="VitD.Rx")
B.nvd.PUR <- sum(df$clinic=="B" & df$vitD.Rx!="VitD.Rx" & df$PUR=="PUR")

B.prop.vd <- B.vd.PUR/B.vd
B.prop.nvd <- B.nvd.PUR/B.nvd
B.prop.vd.SE <- sqrt(B.prop.vd*(1-B.prop.vd)/B.vd)
B.prop.nvd.SE <- sqrt(B.prop.nvd*(1-B.prop.nvd)/B.nvd) 

#

C.vd <- sum(df$clinic=="C" & df$vitD.Rx=="VitD.Rx")
C.vd.PUR <- sum(df$clinic=="C" & df$vitD.Rx=="VitD.Rx" & df$PUR=="PUR")
C.nvd <- sum(df$clinic=="C" & df$vitD.Rx!="VitD.Rx")
C.nvd.PUR <- sum(df$clinic=="C" & df$vitD.Rx!="VitD.Rx" & df$PUR=="PUR")

C.prop.vd <- C.vd.PUR/C.vd
C.prop.nvd <- C.nvd.PUR/C.nvd
C.prop.vd.SE <- sqrt(C.prop.vd*(1-C.prop.vd)/C.vd)
C.prop.nvd.SE <- sqrt(C.prop.nvd*(1-C.prop.nvd)/C.nvd) 

#

D.vd <- sum(df$clinic=="D" & df$vitD.Rx=="VitD.Rx")
D.vd.PUR <- sum(df$clinic=="D" & df$vitD.Rx=="VitD.Rx" & df$PUR=="PUR")
D.nvd <- sum(df$clinic=="D" & df$vitD.Rx!="VitD.Rx")
D.nvd.PUR <- sum(df$clinic=="D" & df$vitD.Rx!="VitD.Rx" & df$PUR=="PUR")

D.prop.vd <- D.vd.PUR/D.vd
D.prop.nvd <- D.nvd.PUR/D.nvd
D.prop.vd.SE <- sqrt(D.prop.vd*(1-D.prop.vd)/D.vd)
D.prop.nvd.SE <- sqrt(D.prop.nvd*(1-D.prop.nvd)/D.nvd) 

props <- c(A.prop.vd, A.prop.nvd, B.prop.vd, B.prop.nvd, C.prop.vd, C.prop.nvd, D.prop.vd, D.prop.nvd)
SE <- c(A.prop.vd.SE, A.prop.nvd.SE, B.prop.vd.SE, B.prop.nvd.SE, C.prop.vd.SE, C.prop.nvd.SE, D.prop.vd.SE, D.prop.nvd.SE)
clinic <- c("A", "A", "B", "B", "C", "C", "D", "D")
VitaminD <- rep(c("Vitamin D", "No Vitamin D"), 4)

clinics <- data.frame(clinic, VitaminD, props, SE)
clinics <- dplyr::mutate(clinics, props=100*props)

g <- ggplot(clinics, aes(x=clinic, y=props, fill=clinic))

g  +  geom_errorbar(aes(ymin=props-(0.1),
                        ymax=props+(100*SE)),
                    width=0.2, colour="black") + 
  geom_bar(stat="identity", colour="black") + facet_wrap(~VitaminD) + 
  theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1)) +
  scale_fill_brewer(palette = 4) +
  ylim(0,100) + guides(fill=FALSE) + xlab("Clinic site") +
  theme(strip.text=element_text(face="bold")) +
  ggtitle("Proportion PUR by clinic and Vit D status") +
  theme(plot.title=element_text(size=rel(1))) +
  ylab("Percent with PUR")

Clinic site C is an outlier, with lower rates of PUR, lower rates of vitamin D prescription, and is the only clinic site where an increased rate of PUR is not seen for patients prescribed vitamin D. Only 11 patients at clinic C were prescribed vitamin D, and none had a PUR.

Now three way comparisons between PUR and all combinations of 2 variables from set {ethnicity, vitamin D, clinic site, age}.

g <- ggpairs(df,
             showStrips = TRUE,
             mapping = ggplot2::aes(
               color = PUR,
               alpha=0.5),
             columns = c("ethnicity","clinic", "vitD.Rx", "age"))
g + theme_grey(base_size = 8)

9. Adjusting for a priori specified potential confounders

Vitamin D remains associated with PUR after adjusting for ethnicity in a logistic regression model:

vars <- c("PUR","ethnicity","clinic","age", "sex",
          "more1site", "microscopy.diagnosis", "culture.diagnosis", "histology.diagnosis",
          "baseline.steroid", "other.immuno.drug", 
          "lymphocytes", "monocytes", "neutrophils", "haemoglobin", 
          "ESR", "albumin", "CRP","vitD.baseline","vitD.6mths", "vitD.Rx")

dat <- df[ ,vars]


# ethnicity collapsed into 3 categories
dat$ethnicity <- as.character(dat$ethnicity)
dat$ethnicity[dat$ethnicity=="Middle.Eastern"] <- "Asian"
dat$ethnicity[dat$ethnicity=="SE.Asian"] <- "Asian"
dat$ethnicity[dat$ethnicity=="South.Asian"] <- "Asian"
# vit D remains significant after adjusting for ethnicity
summary(glm(PUR~vitD.Rx+ethnicity, data=dat, family="binomial"))

## 
## Call:
## glm(formula = PUR ~ vitD.Rx + ethnicity, family = "binomial", 
##     data = dat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9602  -0.4903  -0.4903  -0.3761   2.3169  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -1.7451     0.5114  -3.412 0.000645 ***
## vitD.RxVitD.Rx            1.2100     0.3789   3.193 0.001407 ** 
## ethnicityAsian           -0.3128     0.5267  -0.594 0.552604    
## ethnicityWhite.European  -0.8683     0.6779  -1.281 0.200227    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 209.29  on 248  degrees of freedom
## Residual deviance: 195.93  on 245  degrees of freedom
## AIC: 203.93
## 
## Number of Fisher Scoring iterations: 5

Because there are 0 patients with vitamin D prescription and PUR at clinic C, a standard logistic regression model cannot be fit with clinic as a co-variable. Instead an exact logistic regression model is used here, and shows vitamin D association with PUR is overall independent of clinic site.

datClin <- dat
# collapse to binary clinic variable "clinic C" or "not clinic C"
# as C is the outlier
datClin$clinic <- as.character(datClin$clinic)
datClin$clinic[datClin$clinic!="C"] <- "nonC"

# making a collapsed data frame for exact regression with elrm package
x <- xtabs(~PUR + interaction(clinic, vitD.Rx), data = datClin)

cdat <- cdat <- data.frame(clinic = rep(1:0, 2), vitD.Rx = rep(0:1, each = 2), 
                           PUR = x[2, ], ntrials = colSums(x))

exact.m <- elrm(formula = PUR/ntrials ~ vitD.Rx + clinic, 
                interest = ~ vitD.Rx + clinic,
                iter = 5005000, dataset = cdat, burnIn = 5000, r = 2)

## Progress:   0%  Progress:   5%  Progress:  10%  Progress:  15%  Progress:  20%  Progress:  25%  Progress:  30%  Progress:  35%  Progress:  40%  Progress:  45%  Progress:  50%  Progress:  55%  Progress:  60%  Progress:  65%  Progress:  70%  Progress:  75%  Progress:  80%  Progress:  85%  Progress:  90%  Progress:  95%  Progress: 100%

summary(exact.m)

## 
## Call:
## [[1]]
## elrm(formula = PUR/ntrials ~ vitD.Rx + clinic, interest = ~vitD.Rx + 
##     clinic, r = 2, iter = 5005000, dataset = cdat, burnIn = 5000)
## 
## 
## Results:
##         estimate p-value p-value_se mc_size
## joint         NA 0.00031    0.00001 5000000
## vitD.Rx  1.34424 0.00103    0.00011   83724
## clinic  -1.54588 0.04047    0.00556    2545
## 
## 95% Confidence Intervals for Parameters
## 
##              lower       upper
## vitD.Rx  0.4707123 2.558588197
## clinic  -4.8906887 0.004469184

10. ‘Full’ logistic regression model

All variables significant at the <0.05 level included except clinic (due to zero cell problem). Ethnicity also included as important possible confounder for vit D and PUR. Because ~12% of lymphocyte count observations are missing, in a non-random way, these values are imputed using MCMC. Lymphocyte count is transformed to fit more normal distribution.

# imputation on a reduced data frame
set.seed(123)
vars <- c("PUR","ethnicity","clinic","age", "sex",
          "more1site", "microscopy.diagnosis", "vitD.Rx",
          "baseline.steroid", "other.immuno.drug", 
          "lymphocytes", "monocytes", "CRP")
dat <- df[ ,vars]
dat <- mice(dat)

## 
##  iter imp variable
##   1   1  lymphocytes  monocytes  CRP
##   1   2  lymphocytes  monocytes  CRP
##   1   3  lymphocytes  monocytes  CRP
##   1   4  lymphocytes  monocytes  CRP
##   1   5  lymphocytes  monocytes  CRP
##   2   1  lymphocytes  monocytes  CRP
##   2   2  lymphocytes  monocytes  CRP
##   2   3  lymphocytes  monocytes  CRP
##   2   4  lymphocytes  monocytes  CRP
##   2   5  lymphocytes  monocytes  CRP
##   3   1  lymphocytes  monocytes  CRP
##   3   2  lymphocytes  monocytes  CRP
##   3   3  lymphocytes  monocytes  CRP
##   3   4  lymphocytes  monocytes  CRP
##   3   5  lymphocytes  monocytes  CRP
##   4   1  lymphocytes  monocytes  CRP
##   4   2  lymphocytes  monocytes  CRP
##   4   3  lymphocytes  monocytes  CRP
##   4   4  lymphocytes  monocytes  CRP
##   4   5  lymphocytes  monocytes  CRP
##   5   1  lymphocytes  monocytes  CRP
##   5   2  lymphocytes  monocytes  CRP
##   5   3  lymphocytes  monocytes  CRP
##   5   4  lymphocytes  monocytes  CRP
##   5   5  lymphocytes  monocytes  CRP

dat <- mice::complete(dat)

dat2 <- dat # save for later propensity score matching analysis

# a model with all univariate p<0.05 
# + ethnicity as an important potential confounder
# but no 'clinic'

# ethnicity collapsed to one binary variable
dat$ethnicity <- as.character(dat$ethnicity)
dat$ethnicity[dat$ethnicity!="White.European"] <- "nonWhite.European"

full.m <- glm(PUR~vitD.Rx+I(log(lymphocytes+1))+more1site+microscopy.diagnosis+I(log(CRP+1))+ethnicity+I(log(age)),
            data=dat, family="binomial")

summary(full.m)

## 
## Call:
## glm(formula = PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + 
##     microscopy.diagnosis + I(log(CRP + 1)) + ethnicity + I(log(age)), 
##     family = "binomial", data = dat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4251  -0.5680  -0.3811  -0.2253   2.8868  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)   
## (Intercept)              3.38913    2.34359   1.446  0.14814   
## vitD.Rx2                 1.23256    0.40845   3.018  0.00255 **
## I(log(lymphocytes + 1)) -1.36939    0.76119  -1.799  0.07202 . 
## more1site2               0.50294    0.40036   1.256  0.20904   
## microscopy.diagnosis2    1.11245    0.39725   2.800  0.00510 **
## I(log(CRP + 1))          0.06174    0.16020   0.385  0.69995   
## ethnicityWhite.European -0.05363    0.60390  -0.089  0.92924   
## I(log(age))             -1.47126    0.59445  -2.475  0.01332 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 209.29  on 248  degrees of freedom
## Residual deviance: 172.78  on 241  degrees of freedom
## AIC: 188.78
## 
## Number of Fisher Scoring iterations: 5

In this ‘full’ model, age, microscopy diagnosis (AFB seen on diagnostic sample), and vitamin D prescription remain significant.

11. Stepwise logistic regression models

Next a backwards stepwise selection of variables based on Akaike Information Criteria (AIC) impact of removing the variable was performed:

step.m <- stepAIC(full.m, direction = "backward")

## Start:  AIC=188.78
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis + 
##     I(log(CRP + 1)) + ethnicity + I(log(age))
## 
##                           Df Deviance    AIC
## - ethnicity                1   172.78 186.78
## - I(log(CRP + 1))          1   172.93 186.93
## - more1site                1   174.35 188.35
## <none>                         172.78 188.78
## - I(log(lymphocytes + 1))  1   176.10 190.10
## - I(log(age))              1   179.15 193.15
## - microscopy.diagnosis     1   180.68 194.68
## - vitD.Rx                  1   181.67 195.67
## 
## Step:  AIC=186.78
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis + 
##     I(log(CRP + 1)) + I(log(age))
## 
##                           Df Deviance    AIC
## - I(log(CRP + 1))          1   172.94 184.94
## - more1site                1   174.37 186.37
## <none>                         172.78 186.78
## - I(log(lymphocytes + 1))  1   176.11 188.11
## - microscopy.diagnosis     1   180.69 192.69
## - I(log(age))              1   181.00 193.00
## - vitD.Rx                  1   181.72 193.72
## 
## Step:  AIC=184.94
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis + 
##     I(log(age))
## 
##                           Df Deviance    AIC
## - more1site                1   174.69 184.69
## <none>                         172.94 184.94
## - I(log(lymphocytes + 1))  1   177.28 187.28
## - microscopy.diagnosis     1   180.83 190.83
## - I(log(age))              1   181.35 191.35
## - vitD.Rx                  1   181.98 191.98
## 
## Step:  AIC=184.69
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + microscopy.diagnosis + 
##     I(log(age))
## 
##                           Df Deviance    AIC
## <none>                         174.69 184.69
## - I(log(lymphocytes + 1))  1   180.08 188.08
## - microscopy.diagnosis     1   182.54 190.54
## - I(log(age))              1   183.61 191.61
## - vitD.Rx                  1   184.70 192.70

summary(step.m)

## 
## Call:
## glm(formula = PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + microscopy.diagnosis + 
##     I(log(age)), family = "binomial", data = dat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3473  -0.5534  -0.3772  -0.2367   2.8400  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)   
## (Intercept)               4.1887     2.0252   2.068  0.03861 * 
## vitD.Rx2                  1.2901     0.4033   3.199  0.00138 **
## I(log(lymphocytes + 1))  -1.6228     0.7118  -2.280  0.02262 * 
## microscopy.diagnosis2     1.0974     0.3929   2.793  0.00522 **
## I(log(age))              -1.5270     0.5324  -2.868  0.00413 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 209.29  on 248  degrees of freedom
## Residual deviance: 174.69  on 244  degrees of freedom
## AIC: 184.69
## 
## Number of Fisher Scoring iterations: 5

In this model, age, microscopy diagnosis (AFB seen on diagnostic sample), vitamin D prescription, and lymphocyte count are significant.

This reduced model comapres favorably to the ‘full’ model. It has similar residual deviance and predictive performance in the data set, and is non-inferior in a -2 log likelihood test (with better AIC score):

PseudoR2(step.m)["Nagelkerke"]

## Nagelkerke 
##  0.2282075

PseudoR2(full.m)["Nagelkerke"]

## Nagelkerke 
##  0.2399237

PseudoR2(step.m)["AIC"]

##      AIC 
## 184.6892

PseudoR2(full.m)["AIC"]

##      AIC 
## 188.7761

step.m.p <- predict(step.m, type = "response")
full.m.p <- predict(full.m, type = "response")

roc(dat$PUR, step.m.p, data=dat)$auc

## Area under the curve: 0.7743

roc(dat$PUR, full.m.p, data=dat)$auc

## Area under the curve: 0.7815

anova(full.m, step.m, test ="Chisq")

## Analysis of Deviance Table
## 
## Model 1: PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis + 
##     I(log(CRP + 1)) + ethnicity + I(log(age))
## Model 2: PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + microscopy.diagnosis + 
##     I(log(age))
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       241     172.78                     
## 2       244     174.69 -3  -1.9131   0.5906

An automated search for an improved model including all possible two way interactions between age, microscopy diagnosis (AFB seen on diagnostic sample), vitamin D prescription, and lymphocyte count was performed. The best model including interaction terms was not significantly better than the previous stepwise model.

search <- step(step.m, ~.^2)

## Start:  AIC=184.69
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + microscopy.diagnosis + 
##     I(log(age))
## 
##                                                Df Deviance    AIC
## + I(log(lymphocytes + 1)):microscopy.diagnosis  1   172.56 184.56
## <none>                                              174.69 184.69
## + microscopy.diagnosis:I(log(age))              1   173.73 185.73
## + vitD.Rx:microscopy.diagnosis                  1   173.90 185.90
## + vitD.Rx:I(log(age))                           1   174.54 186.54
## + I(log(lymphocytes + 1)):I(log(age))           1   174.57 186.57
## + vitD.Rx:I(log(lymphocytes + 1))               1   174.68 186.68
## - I(log(lymphocytes + 1))                       1   180.08 188.08
## - microscopy.diagnosis                          1   182.54 190.54
## - I(log(age))                                   1   183.61 191.61
## - vitD.Rx                                       1   184.70 192.70
## 
## Step:  AIC=184.56
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + microscopy.diagnosis + 
##     I(log(age)) + I(log(lymphocytes + 1)):microscopy.diagnosis
## 
##                                                Df Deviance    AIC
## <none>                                              172.56 184.56
## - I(log(lymphocytes + 1)):microscopy.diagnosis  1   174.69 184.69
## + vitD.Rx:microscopy.diagnosis                  1   171.57 185.57
## + microscopy.diagnosis:I(log(age))              1   171.95 185.95
## + I(log(lymphocytes + 1)):I(log(age))           1   172.12 186.12
## + vitD.Rx:I(log(age))                           1   172.49 186.49
## + vitD.Rx:I(log(lymphocytes + 1))               1   172.54 186.54
## - I(log(age))                                   1   182.02 192.02
## - vitD.Rx                                       1   182.35 192.35

anova(search)

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: PUR
## 
## Terms added sequentially (first to last)
## 
## 
##                                              Df Deviance Resid. Df
## NULL                                                           248
## vitD.Rx                                       1  11.5120       247
## I(log(lymphocytes + 1))                       1   6.1153       246
## microscopy.diagnosis                          1   8.0510       245
## I(log(age))                                   1   8.9235       244
## I(log(lymphocytes + 1)):microscopy.diagnosis  1   2.1312       243
##                                              Resid. Dev
## NULL                                             209.29
## vitD.Rx                                          197.78
## I(log(lymphocytes + 1))                          191.66
## microscopy.diagnosis                             183.61
## I(log(age))                                      174.69
## I(log(lymphocytes + 1)):microscopy.diagnosis     172.56

12. Overly Influential cases

Using the stepwise model without interaction terms, cases with disproportionate influence were identified (defined as Cook’s distance > 4 standard deviations). These 7 cases were removed and full and stepwise models were rerun to give a final model.

summary(cooks.distance(step.m))

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000094 0.0000883 0.0003998 0.0042930 0.0024280 0.0666300

cutoff <- sd(cooks.distance(step.m))*4
plot(cooks.distance(step.m))
abline(h=cutoff, col="darkgreen")

dat.reduced <- dat[cooks.distance(step.m)<cutoff,]

cooksAdj.full.m <- glm(PUR~vitD.Rx+I(log(lymphocytes+1))+more1site+microscopy.diagnosis+I(log(CRP+1))+ethnicity+I(log(age)),
              data=dat.reduced, family="binomial")

cooksAdj.step.m <- stepAIC(cooksAdj.full.m, direction = "backward")

## Start:  AIC=151.37
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis + 
##     I(log(CRP + 1)) + ethnicity + I(log(age))
## 
##                           Df Deviance    AIC
## - ethnicity                1   135.43 149.43
## - I(log(CRP + 1))          1   135.45 149.45
## <none>                         135.37 151.37
## - more1site                1   138.53 152.53
## - I(log(lymphocytes + 1))  1   140.89 154.89
## - vitD.Rx                  1   141.66 155.66
## - microscopy.diagnosis     1   144.31 158.31
## - I(log(age))              1   145.92 159.92
## 
## Step:  AIC=149.43
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis + 
##     I(log(CRP + 1)) + I(log(age))
## 
##                           Df Deviance    AIC
## - I(log(CRP + 1))          1   135.53 147.53
## <none>                         135.43 149.43
## - more1site                1   138.63 150.63
## - I(log(lymphocytes + 1))  1   140.96 152.96
## - vitD.Rx                  1   141.77 153.77
## - microscopy.diagnosis     1   144.31 156.31
## - I(log(age))              1   149.09 161.09
## 
## Step:  AIC=147.53
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis + 
##     I(log(age))
## 
##                           Df Deviance    AIC
## <none>                         135.53 147.53
## - more1site                1   138.94 148.94
## - vitD.Rx                  1   141.91 151.91
## - I(log(lymphocytes + 1))  1   142.33 152.33
## - microscopy.diagnosis     1   144.36 154.36
## - I(log(age))              1   149.50 159.50

final.m <- cooksAdj.step.m

13. Description final model

summary(final.m)

## 
## Call:
## glm(formula = PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + 
##     microscopy.diagnosis + I(log(age)), family = "binomial", 
##     data = dat.reduced)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5126  -0.4609  -0.2795  -0.1544   2.5752  
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               6.7808     2.6035   2.605 0.009200 ** 
## vitD.Rx2                  1.1874     0.4646   2.556 0.010588 *  
## I(log(lymphocytes + 1))  -2.2648     0.9027  -2.509 0.012108 *  
## more1site2                0.8309     0.4533   1.833 0.066827 .  
## microscopy.diagnosis2     1.3175     0.4494   2.932 0.003368 ** 
## I(log(age))              -2.3408     0.6793  -3.446 0.000569 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 181.38  on 241  degrees of freedom
## Residual deviance: 135.53  on 236  degrees of freedom
## AIC: 147.53
## 
## Number of Fisher Scoring iterations: 6

with(final.m, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))

## [1] 9.716045e-09

final.m.p <- predict(final.m, type = "response")
roc(dat.reduced$PUR, final.m.p, data=dat.reduced)$auc

## Area under the curve: 0.8437

PseudoR2(final.m)["Nagelkerke"]

## Nagelkerke 
##  0.3272972

PseudoR2(final.m)["AIC"]

##      AIC 
## 147.5254

14. Propensity score matching analysis

As seen above clinic site was related to both probability of vitamin D prescribing and probability of PUR occurance (hence a significant risk is a confounder), but could not be included in the final logistic regression model due to zero cell occurence. As an additional check that results are robust despite this problem, here a propensity score analysis is performed where cases are matched on propensity for vitamin D to be prescribed, using covariates that were related to both vitamin D prescription and PUR outcome. Using the (smaller) matched dataset an assocation between vitamin D prescribing and PUR is still found.

### *** uses a earlier version of dat before ethnicity collapsed to binary

# how much do the co-factors influence vitamin D Rx propensity / probability?
vitDRx.m <- glm(vitD.Rx~age+ethnicity+clinic, data=dat2,
                family="binomial")
summary(vitDRx.m)

## 
## Call:
## glm(formula = vitD.Rx ~ age + ethnicity + clinic, family = "binomial", 
##     data = dat2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5549  -0.6141  -0.4020  -0.1956   2.2777  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.64112    0.77109  -3.425 0.000614 ***
## age         -0.00149    0.01308  -0.114 0.909280    
## ethnicity2   1.84059    1.62751   1.131 0.258089    
## ethnicity3  -1.36501    1.18718  -1.150 0.250228    
## ethnicity4   0.21478    0.50573   0.425 0.671056    
## ethnicity5  -1.19302    0.66990  -1.781 0.074929 .  
## clinic2      2.33300    0.48413   4.819 1.44e-06 ***
## clinic3      1.72506    0.55254   3.122 0.001796 ** 
## clinic4      2.94495    0.57766   5.098 3.43e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 267.91  on 248  degrees of freedom
## Residual deviance: 216.06  on 240  degrees of freedom
## AIC: 234.06
## 
## Number of Fisher Scoring iterations: 5

library(BaylorEdPsych)
PseudoR2(vitDRx.m)["Nagelkerke"]

## Nagelkerke 
##  0.2852224

vitDRx.m.p <- predict(vitDRx.m, type = "response")
library(pROC)
roc(dat2$vitD.Rx, vitDRx.m.p, data=dat2)$auc

## Area under the curve: 0.7908

### propensity score matching with MatchIt 
library(MatchIt)
# grouping variable is dat$vitD.Rx
# matching variables chosen : age, clinic, ethnicity

# needs binary or boolean grouping variable
dat2$vitD.Rx <- as.character(dat2$vitD.Rx)
dat2$vitD.Rx[dat2$vitD.Rx=="Nil.vitD.Rx"] <- 0
dat2$vitD.Rx[dat2$vitD.Rx=="VitD.Rx"] <- 1
dat2$vitD.Rx <- as.numeric(dat2$vitD.Rx)

#matching treatments:controls in 1:1 ratio
m.out <- matchit(vitD.Rx ~ age + clinic + ethnicity,
                data = dat2, method = "nearest", ratio = 1) 
summary(m.out)

## 
## Call:
## matchit(formula = vitD.Rx ~ age + clinic + ethnicity, data = dat2, 
##     method = "nearest", ratio = 1)
## 
## Summary of balance for all data:
##            Means Treated Means Control SD Control Mean Diff eQQ Med
## distance          0.3856        0.1824     0.1708    0.2032  0.2024
## age              37.3158       41.9333    16.6348   -4.6175  4.7000
## clinicA           0.1228        0.5052     0.5013   -0.3824  0.0000
## clinicB           0.4211        0.1979     0.3995    0.2231  0.0000
## clinicC           0.1930        0.2188     0.4145   -0.0258  0.0000
## clinicD           0.2632        0.0781     0.2691    0.1850  0.0000
## ethnicity2        0.0175        0.0052     0.0722    0.0123  0.0000
## ethnicity3        0.0175        0.0521     0.2228   -0.0345  0.0000
## ethnicity4        0.6842        0.5729     0.4959    0.1113  0.0000
## ethnicity5        0.1228        0.2760     0.4482   -0.1532  0.0000
##            eQQ Mean eQQ Max
## distance     0.2056  0.3775
## age          4.8105 13.5000
## clinicA      0.3860  1.0000
## clinicB      0.2281  1.0000
## clinicC      0.0351  1.0000
## clinicD      0.1754  1.0000
## ethnicity2   0.0000  0.0000
## ethnicity3   0.0351  1.0000
## ethnicity4   0.1228  1.0000
## ethnicity5   0.1579  1.0000
## 
## 
## Summary of balance for matched data:
##            Means Treated Means Control SD Control Mean Diff eQQ Med
## distance          0.3856        0.3750     0.1688    0.0106  0.0013
## age              37.3158       37.0596    13.8588    0.2561  1.4000
## clinicA           0.1228        0.1053     0.3096    0.0175  0.0000
## clinicB           0.4211        0.4386     0.5006   -0.0175  0.0000
## clinicC           0.1930        0.2456     0.4343   -0.0526  0.0000
## clinicD           0.2632        0.2105     0.4113    0.0526  0.0000
## ethnicity2        0.0175        0.0000     0.0000    0.0175  0.0000
## ethnicity3        0.0175        0.0175     0.1325    0.0000  0.0000
## ethnicity4        0.6842        0.7368     0.4443   -0.0526  0.0000
## ethnicity5        0.1228        0.1228     0.3311    0.0000  0.0000
##            eQQ Mean eQQ Max
## distance     0.0111   0.096
## age          2.0737   7.500
## clinicA      0.0175   1.000
## clinicB      0.0175   1.000
## clinicC      0.0526   1.000
## clinicD      0.0526   1.000
## ethnicity2   0.0175   1.000
## ethnicity3   0.0000   0.000
## ethnicity4   0.0526   1.000
## ethnicity5   0.0000   0.000
## 
## Percent Balance Improvement:
##            Mean Diff. eQQ Med eQQ Mean  eQQ Max
## distance      94.7811 99.3405  94.6230  74.5751
## age           94.4529 70.2128  56.8928  44.4444
## clinicA       95.4122  0.0000  95.4545   0.0000
## clinicB       92.1376  0.0000  92.3077   0.0000
## clinicC     -104.2553  0.0000 -50.0000   0.0000
## clinicD       71.5556  0.0000  70.0000   0.0000
## ethnicity2   -42.2222  0.0000     -Inf     -Inf
## ethnicity3   100.0000  0.0000 100.0000 100.0000
## ethnicity4    52.7094  0.0000  57.1429   0.0000
## ethnicity5   100.0000  0.0000 100.0000 100.0000
## 
## Sample sizes:
##           Control Treated
## All           192      57
## Matched        57      57
## Unmatched     135       0
## Discarded       0       0

# plot(m.out, type = "jitter")
plot(m.out, type = "hist")

# extract the matched data
m.data1 <- match.data(m.out)

# check vit D Rx still associated with PUR after matching
matched.m <- glm(PUR~vitD.Rx, data=m.data1, family="binomial")
summary(matched.m)

## 
## Call:
## glm(formula = PUR ~ vitD.Rx, family = "binomial", data = m.data1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8416  -0.8416  -0.5500  -0.5500   1.9817  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.8124     0.3813  -4.753 2.01e-06 ***
## vitD.Rx       0.9567     0.4788   1.998   0.0457 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 119.93  on 113  degrees of freedom
## Residual deviance: 115.71  on 112  degrees of freedom
## AIC: 119.71
## 
## Number of Fisher Scoring iterations: 4

exp(confint(matched.m))

##                  2.5 %    97.5 %
## (Intercept) 0.07150922 0.3254047
## vitD.Rx     1.04469433 6.9629099

exp(coef(matched.m))

## (Intercept)     vitD.Rx 
##   0.1632653   2.6031250

#matching treatments:controls in 1:2 ratio
m.out <- matchit(vitD.Rx ~ age + clinic + ethnicity,
                data = dat2, method = "nearest", ratio = 2) 
summary(m.out)

## 
## Call:
## matchit(formula = vitD.Rx ~ age + clinic + ethnicity, data = dat2, 
##     method = "nearest", ratio = 2)
## 
## Summary of balance for all data:
##            Means Treated Means Control SD Control Mean Diff eQQ Med
## distance          0.3856        0.1824     0.1708    0.2032  0.2024
## age              37.3158       41.9333    16.6348   -4.6175  4.7000
## clinicA           0.1228        0.5052     0.5013   -0.3824  0.0000
## clinicB           0.4211        0.1979     0.3995    0.2231  0.0000
## clinicC           0.1930        0.2188     0.4145   -0.0258  0.0000
## clinicD           0.2632        0.0781     0.2691    0.1850  0.0000
## ethnicity2        0.0175        0.0052     0.0722    0.0123  0.0000
## ethnicity3        0.0175        0.0521     0.2228   -0.0345  0.0000
## ethnicity4        0.6842        0.5729     0.4959    0.1113  0.0000
## ethnicity5        0.1228        0.2760     0.4482   -0.1532  0.0000
##            eQQ Mean eQQ Max
## distance     0.2056  0.3775
## age          4.8105 13.5000
## clinicA      0.3860  1.0000
## clinicB      0.2281  1.0000
## clinicC      0.0351  1.0000
## clinicD      0.1754  1.0000
## ethnicity2   0.0000  0.0000
## ethnicity3   0.0351  1.0000
## ethnicity4   0.1228  1.0000
## ethnicity5   0.1579  1.0000
## 
## 
## Summary of balance for matched data:
##            Means Treated Means Control SD Control Mean Diff eQQ Med
## distance          0.3856        0.2654     0.1782    0.1202  0.1427
## age              37.3158       42.1219    17.5984   -4.8061  5.8000
## clinicA           0.1228        0.1667     0.3743   -0.0439  0.0000
## clinicB           0.4211        0.3333     0.4735    0.0877  0.0000
## clinicC           0.1930        0.3684     0.4845   -0.1754  0.0000
## clinicD           0.2632        0.1316     0.3395    0.1316  0.0000
## ethnicity2        0.0175        0.0088     0.0937    0.0088  0.0000
## ethnicity3        0.0175        0.0526     0.2243   -0.0351  0.0000
## ethnicity4        0.6842        0.5088     0.5021    0.1754  0.0000
## ethnicity5        0.1228        0.3246     0.4703   -0.2018  0.0000
##            eQQ Mean eQQ Max
## distance     0.1236  0.2842
## age          5.3386 12.2000
## clinicA      0.0351  1.0000
## clinicB      0.0877  1.0000
## clinicC      0.1754  1.0000
## clinicD      0.1404  1.0000
## ethnicity2   0.0000  0.0000
## ethnicity3   0.0351  1.0000
## ethnicity4   0.1754  1.0000
## ethnicity5   0.1930  1.0000
## 
## Percent Balance Improvement:
##            Mean Diff.  eQQ Med  eQQ Mean eQQ Max
## distance      40.8360  29.5173   39.8790 24.7025
## age           -4.0843 -23.4043  -10.9774  9.6296
## clinicA       88.5305   0.0000   90.9091  0.0000
## clinicB       60.6880   0.0000   61.5385  0.0000
## clinicC     -580.8511   0.0000 -400.0000  0.0000
## clinicD       28.8889   0.0000   20.0000  0.0000
## ethnicity2    28.8889   0.0000    0.0000  0.0000
## ethnicity3    -1.5873   0.0000    0.0000  0.0000
## ethnicity4   -57.6355   0.0000  -42.8571  0.0000
## ethnicity5   -31.6637   0.0000  -22.2222  0.0000
## 
## Sample sizes:
##           Control Treated
## All           192      57
## Matched       114      57
## Unmatched      78       0
## Discarded       0       0

# plot(m.out, type = "jitter")
plot(m.out, type = "hist")

# extract the matched data
m.data1 <- match.data(m.out)

# check vit D Rx still associated with PUR after matching
matched.m <- glm(PUR~vitD.Rx, data=m.data1, family="binomial")
summary(matched.m)

## 
## Call:
## glm(formula = PUR ~ vitD.Rx, family = "binomial", data = m.data1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8416  -0.5312  -0.5312  -0.5312   2.0140  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.8871     0.2771  -6.811  9.7e-12 ***
## vitD.Rx       1.0314     0.4007   2.574   0.0101 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 164.86  on 170  degrees of freedom
## Residual deviance: 158.25  on 169  degrees of freedom
## AIC: 162.25
## 
## Number of Fisher Scoring iterations: 4

exp(confint(matched.m))

##                 2.5 %    97.5 %
## (Intercept) 0.0845356 0.2524428
## vitD.Rx     1.2801606 6.2206071

exp(coef(matched.m))

## (Intercept)     vitD.Rx 
##   0.1515152   2.8050000