#packages
library(RCurl)
library(knitr)
library(elrm)
library(psych)
library(survival)
library(mice)
library(VIM)
library(MASS)
library(BaylorEdPsych)
library(pROC)
library(GGally)
library(tableone)
library(dplyr)
library(ggplot2)
# data read in
url <- "https://raw.githubusercontent.com/davidadambarr/EPTB_PUR/master/PURTB_raw.csv"
x <- getURL(url)
df <- read.csv(text = x)
### coerce data types and clean data
#dates
format <- "%d/%m/%Y"
df$PUR.date <- as.Date(as.character(df$PUR.date),format)
df$Date.TB.Rx.start <- as.Date(as.character(df$Date.TB.Rx.start),format)
df$Date.TB.Rx.finish <- as.Date(as.character(df$Date.TB.Rx.finish),format)
df$VitD.Rx.Date <- as.Date(as.character(df$VitD.Rx.Date),format)
# below limit of detection values
df$CRP <- as.character(df$CRP)
df$CRP[df$CRP=="<0.2"] <- "0.2"
df$CRP[df$CRP=="<3"] <- "3"
df$CRP <- as.numeric(df$CRP)
### create 'timing' variables
df$PUR.time <- as.numeric(df$PUR.date - df$Date.TB.Rx.start)
### create a 'number of sites' variable
df <- dplyr::mutate(df, NumSites=pleural+internal.LN+external.LN+CNS+BJI+pericardial+abdominal+other.site)
df <- dplyr::mutate(df, more1site=NumSites>1|miliary==1)
df$more1site <- as.factor(df$more1site)
### Table 1
df <- dplyr::select(df, -number)
vars <- c("age", "sex", "PUR", "vitD.Rx", "ethnicity", "clinic",
"lymphocytes","monocytes","neutrophils","haemoglobin","ESR","albumin","CRP",
"vitD.baseline","vitD.6mths","vitD.12mths",
"pleural","internal.LN","external.LN","CNS", "BJI","pericardial","abdominal","miliary","other.site",
"microscopy.diagnosis","culture.diagnosis","histology.diagnosis", "basis.diagnosis", "more1site",
"baseline.steroid","other.immuno.drug",
"PUR.Rx","ADR","prior.hypercalcaemia","prior.hypocalcaemia","calcaemia_during.TBRx")
# numeric site variables to factors
siteVars <- c("pleural","internal.LN","external.LN","CNS", "BJI","pericardial","abdominal","miliary","other.site")
df[ ,siteVars] <- lapply(df[ ,siteVars], as.factor)
catVars <- sapply(df, is.factor)
numVars <- sapply(df, is.numeric)
catVars <- names(df)[catVars]
numVars <- names(df)[numVars]
# nonparametric and exact tests for p values
# overall cohort
tbl1.2 <- tableone::CreateTableOne(data=df, vars=vars)
tbl1.2 <- print(tbl1.2, showAllLevels = FALSE, format = "f",
nonnormal=numVars,
quote = FALSE, noSpaces = TRUE, printToggle = FALSE)
# stratified by vit D prescription status
tbl1 <- tableone::CreateTableOne(data=df, vars=vars, strata = "vitD.Rx")
tbl1 <- print(tbl1, showAllLevels = FALSE, format = "f",
exact=catVars, nonnormal=numVars,
quote = FALSE, noSpaces = TRUE, printToggle = FALSE)
Printing tables. First is the overall cohort description:
kable(tbl1.2, format = "markdown")
| Overall | |
|---|---|
| n | 249 |
| age (median [IQR]) | 36.00 [28.90, 50.70] |
| sex = male | 158 |
| PUR = PUR | 37 |
| vitD.Rx = VitD.Rx | 57 |
| ethnicity | |
| African | 27 |
| Middle.Eastern | 2 |
| SE.Asian | 11 |
| South.Asian | 149 |
| White.European | 60 |
| clinic | |
| A | 104 |
| B | 62 |
| C | 53 |
| D | 30 |
| lymphocytes (median [IQR]) | 1.40 [1.02, 1.90] |
| monocytes (median [IQR]) | 0.60 [0.43, 0.80] |
| neutrophils (median [IQR]) | 4.50 [3.40, 6.08] |
| haemoglobin (median [IQR]) | 130.00 [115.00, 140.00] |
| ESR (median [IQR]) | 29.00 [13.00, 51.00] |
| albumin (median [IQR]) | 34.00 [30.00, 38.00] |
| CRP (median [IQR]) | 23.00 [5.00, 65.00] |
| vitD.baseline (median [IQR]) | 10.00 [7.00, 19.00] |
| vitD.6mths (median [IQR]) | 26.00 [13.00, 44.75] |
| vitD.12mths (median [IQR]) | 7.00 [7.00, 17.75] |
| pleural = 1 | 48 |
| internal.LN = 1 | 134 |
| external.LN = 1 | 96 |
| CNS = 1 | 6 |
| BJI = 1 | 36 |
| pericardial = 1 | 9 |
| abdominal = 1 | 47 |
| miliary = 1 | 2 |
| other.site = 1 | 22 |
| microscopy.diagnosis = positive | 80 |
| culture.diagnosis = positive | 153 |
| histology.diagnosis = positive | 138 |
| basis.diagnosis = micro_or_histo_confirmed | 222 |
| more1site = TRUE | 96 |
| baseline.steroid = Yes | 54 |
| other.immuno.drug = Yes | 11 |
| PUR.Rx | |
| Extended.course.TB.therapy | 5 |
| None | 12 |
| NSAID | 2 |
| Oral.Steroid | 11 |
| Percutaneous.Aspiration | 2 |
| Percutaneous.Aspiration.Oral.Steroid | 3 |
| Surgery | 1 |
| ADR = No_recorded_ADR | 182 |
| prior.hypercalcaemia = Prior_hypercalcaemia | 15 |
| prior.hypocalcaemia = Prior_hypocalcaemia | 7 |
| calcaemia_during.TBRx | |
| hypercalcaemic | 10 |
| hypocalcaemia | 4 |
| normocalcaemic | 192 |
Second, whole cohort description stratified by vitamin D prescription status:
kable(tbl1, format = "markdown")
| Nil.vitD.Rx | VitD.Rx | p | test | |
|---|---|---|---|---|
| n | 192 | 57 | ||
| age (median [IQR]) | 38.65 [29.48, 52.33] | 32.70 [26.80, 43.40] | 0.055 | nonnorm |
| sex = male | 123 | 35 | 0.755 | exact |
| PUR = PUR | 20 | 17 | 0.001 | exact |
| vitD.Rx = VitD.Rx | 0 | 57 | <0.001 | exact |
| ethnicity | 0.039 | exact | ||
| African | 18 | 9 | ||
| Middle.Eastern | 1 | 1 | ||
| SE.Asian | 10 | 1 | ||
| South.Asian | 110 | 39 | ||
| White.European | 53 | 7 | ||
| clinic | <0.001 | exact | ||
| A | 97 | 7 | ||
| B | 38 | 24 | ||
| C | 42 | 11 | ||
| D | 15 | 15 | ||
| lymphocytes (median [IQR]) | 1.40 [1.04, 1.90] | 1.34 [0.97, 1.82] | 0.592 | nonnorm |
| monocytes (median [IQR]) | 0.61 [0.50, 0.80] | 0.56 [0.40, 0.80] | 0.177 | nonnorm |
| neutrophils (median [IQR]) | 4.55 [3.40, 5.90] | 4.41 [3.38, 6.78] | 0.665 | nonnorm |
| haemoglobin (median [IQR]) | 130.00 [116.00, 141.00] | 125.50 [111.00, 137.00] | 0.223 | nonnorm |
| ESR (median [IQR]) | 33.00 [13.00, 50.00] | 26.50 [10.00, 61.25] | 0.944 | nonnorm |
| albumin (median [IQR]) | 35.00 [30.00, 38.00] | 33.00 [28.00, 37.00] | 0.131 | nonnorm |
| CRP (median [IQR]) | 21.00 [4.60, 64.75] | 29.00 [6.00, 65.00] | 0.505 | nonnorm |
| vitD.baseline (median [IQR]) | 16.00 [7.00, 26.00] | 7.00 [7.00, 10.00] | 0.004 | nonnorm |
| vitD.6mths (median [IQR]) | 24.00 [18.75, 28.50] | 26.00 [9.25, 58.50] | 0.846 | nonnorm |
| vitD.12mths (median [IQR]) | 8.50 [7.00, 17.75] | 7.00 [7.00, 15.75] | 0.817 | nonnorm |
| pleural = 1 | 39 | 9 | 0.567 | exact |
| internal.LN = 1 | 103 | 31 | 1.000 | exact |
| external.LN = 1 | 70 | 26 | 0.219 | exact |
| CNS = 1 | 4 | 2 | 0.623 | exact |
| BJI = 1 | 21 | 15 | 0.009 | exact |
| pericardial = 1 | 6 | 3 | 0.432 | exact |
| abdominal = 1 | 34 | 13 | 0.441 | exact |
| miliary = 1 | 1 | 1 | 0.406 | exact |
| other.site = 1 | 17 | 5 | 1.000 | exact |
| microscopy.diagnosis = positive | 62 | 18 | 1.000 | exact |
| culture.diagnosis = positive | 111 | 42 | 0.031 | exact |
| histology.diagnosis = positive | 103 | 35 | 0.363 | exact |
| basis.diagnosis = micro_or_histo_confirmed | 167 | 55 | 0.051 | exact |
| more1site = TRUE | 70 | 26 | 0.219 | exact |
| baseline.steroid = Yes | 42 | 12 | 1.000 | exact |
| other.immuno.drug = Yes | 10 | 1 | 0.465 | exact |
| PUR.Rx | 0.058 | exact | ||
| Extended.course.TB.therapy | 5 | 0 | ||
| None | 6 | 6 | ||
| NSAID | 0 | 2 | ||
| Oral.Steroid | 4 | 7 | ||
| Percutaneous.Aspiration | 2 | 0 | ||
| Percutaneous.Aspiration.Oral.Steroid | 1 | 2 | ||
| Surgery | 1 | 0 | ||
| ADR = No_recorded_ADR | 140 | 42 | 0.864 | exact |
| prior.hypercalcaemia = Prior_hypercalcaemia | 14 | 1 | 0.203 | exact |
| prior.hypocalcaemia = Prior_hypocalcaemia | 4 | 3 | 0.362 | exact |
| calcaemia_during.TBRx | 0.879 | exact | ||
| hypercalcaemic | 7 | 3 | ||
| hypocalcaemia | 3 | 1 | ||
| normocalcaemic | 144 | 48 |
Note that there are several potential confounders including clinic site and ethnicity as predicted a priori.
Distribution of timing first PUR event in those patients who experienced a PUR.
# timing
summary(df$PUR.time)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 10.0 27.0 52.5 100.4 167.8 504.0 213
ggplot(df, aes(x=PUR.time)) + geom_histogram() + ggtitle("Timing of first PUR, days from start TB Rx")
ggplot(df, aes(x=log(PUR.time))) + geom_histogram(bins=10)+
ggtitle("Timing of first PUR, log(days)")
time <- df$PUR.time
event <- as.numeric(df$PUR=="PUR")
# looking at PUR positive patients only:
kmsurvival <- survfit(Surv(time,event) ~ 1)
plot(kmsurvival, xlab="Time in days from start RHZE",ylab="Proportion no PUR")
title(main = "Timing of PURs")
Treatment modalities used for PUR cases.
# treatment of PUR
table(df$PUR.Rx)
##
## Extended.course.TB.therapy None
## 5 12
## NSAID Oral.Steroid
## 2 11
## Percutaneous.Aspiration Percutaneous.Aspiration.Oral.Steroid
## 2 3
## Surgery
## 1
Several of the distributions look more normal after transformation. For example, lymphocyte count:
par(mfrow=c(2,2))
hist(df$lymphocytes, main = "Lymphocyte count histogram")
qqnorm(df$lymphocytes, main = "Lymphocyte count QQ plot")
hist(log(df$lymphocytes+1), main = "After transformation histogram")
qqnorm(log(df$lymphocytes+1), main = "After transformation QQ plot")
par(mfrow=c(1,1))
Here, proportion of observations missing is plotted by variable:
vars <- c("PUR","ethnicity","clinic","age", "sex",
"more1site",
"microscopy.diagnosis", "culture.diagnosis", "histology.diagnosis",
"baseline.steroid", "other.immuno.drug",
"lymphocytes", "monocytes", "neutrophils", "haemoglobin",
"ESR", "albumin", "CRP","vitD.baseline","vitD.6mths", "vitD.Rx")
dat <- df[ ,vars]
# check for missingness by variable
missing <- data.frame(colSums(is.na(dat)))
vars <- rownames(missing)
missing <- cbind(missing, vars)
n <- nrow(dat)
names(missing) <- c("n_missing", "vars")
missing$vars <- as.character(missing$vars)
missing <- dplyr::mutate(missing, prop_missing=n_missing/n)
p2 <- ggplot(missing, aes(x=reorder(vars, -prop_missing), y=prop_missing))
p2 + geom_bar(stat="identity") + theme(axis.text.x=element_text(angle=90, hjust=1)) +
ggtitle("Proportion missing by variable")
Here, all the data is visualised by variable (columns) and by patient (rows). Higher values are a darker greyscale shade, absence of a binary variable is white and presence of a binary variable is black. Missing observations are coloured red.
mar.default <- par("mar")
cex.default <- par("cex")
par(mar=c(12, 4.1,4.1,2.1), cex=0.7)
matrixplot(dat)
par(mar=c(mar.default))
par(cex=c(cex.default))
**Note Vit D serum levels and ESR are majority incomplete so can’t easily be used in later regression modeling. Other baseline blood results are about 10-12% missing and they appear to be missing in non-random way (e.g. some clinic sites more complete than others. This means deletion of cases with missing observations may introduce bias in regression modelling and suggests imputation of missing observations is more appropriate fix.
vars <- c("age", "sex", "vitD.Rx", "ethnicity", "clinic",
"lymphocytes","monocytes","neutrophils","haemoglobin","ESR","albumin","CRP",
"vitD.baseline", "more1site",
"microscopy.diagnosis","culture.diagnosis","histology.diagnosis", "basis.diagnosis",
"baseline.steroid","other.immuno.drug","PUR")
nonnorm <- c("lymphocytes", "monocytes", "neutrophils", "haemoglobin", "ESR",
"albumin", "CRP", "age", "vitD.baseline")
catVars <- sapply(df, is.factor)
numVars <- sapply(df, is.numeric)
catVars <- names(df)[catVars]
numVars <- names(df)[numVars]
catVars <- dplyr::intersect(catVars, vars)
tbl2 <- tableone::CreateTableOne(data=df, vars=vars, strata = "PUR")
tbl2 <- print(tbl2, showAllLevels = FALSE, format = "f",
exact=catVars, nonnormal=nonnorm,
quote = FALSE, noSpaces = TRUE, printToggle = FALSE)
Print table of anivariate associations:
kable(tbl2, format = "markdown")
| No_PUR | PUR | p | test | |
|---|---|---|---|---|
| n | 212 | 37 | ||
| age (median [IQR]) | 38.05 [29.95, 52.80] | 30.00 [23.50, 42.30] | 0.002 | nonnorm |
| sex = male | 131 | 27 | 0.267 | exact |
| vitD.Rx = VitD.Rx | 40 | 17 | 0.001 | exact |
| ethnicity | 0.374 | exact | ||
| African | 21 | 6 | ||
| Middle.Eastern | 2 | 0 | ||
| SE.Asian | 9 | 2 | ||
| South.Asian | 125 | 24 | ||
| White.European | 55 | 5 | ||
| clinic | 0.009 | exact | ||
| A | 92 | 12 | ||
| B | 49 | 13 | ||
| C | 50 | 3 | ||
| D | 21 | 9 | ||
| lymphocytes (median [IQR]) | 1.43 [1.10, 1.90] | 1.14 [0.83, 1.51] | 0.013 | nonnorm |
| monocytes (median [IQR]) | 0.60 [0.47, 0.80] | 0.66 [0.40, 0.81] | 0.649 | nonnorm |
| neutrophils (median [IQR]) | 4.48 [3.40, 5.93] | 4.95 [3.35, 6.62] | 0.696 | nonnorm |
| haemoglobin (median [IQR]) | 130.00 [116.00, 141.00] | 124.00 [114.00, 137.25] | 0.306 | nonnorm |
| ESR (median [IQR]) | 29.00 [13.00, 49.00] | 32.50 [9.50, 61.25] | 0.609 | nonnorm |
| albumin (median [IQR]) | 34.00 [30.00, 38.00] | 34.00 [29.00, 37.50] | 0.641 | nonnorm |
| CRP (median [IQR]) | 20.00 [4.93, 54.75] | 55.00 [13.00, 72.50] | 0.045 | nonnorm |
| vitD.baseline (median [IQR]) | 10.00 [7.00, 20.00] | 7.00 [7.00, 10.00] | 0.219 | nonnorm |
| more1site = TRUE | 76 | 20 | 0.044 | exact |
| microscopy.diagnosis = positive | 60 | 20 | 0.004 | exact |
| culture.diagnosis = positive | 125 | 28 | 0.067 | exact |
| histology.diagnosis = positive | 114 | 24 | 0.282 | exact |
| basis.diagnosis = micro_or_histo_confirmed | 186 | 36 | 0.146 | exact |
| baseline.steroid = Yes | 42 | 12 | 0.128 | exact |
| other.immuno.drug = Yes | 11 | 0 | 0.377 | exact |
| PUR = PUR | 0 | 37 | <0.001 | exact |
Particularly interested in two a priori suspected confounders of any relationship between vitamin D prescription and PUR: clinic site and ethnicity, so these visualised.
Plots show % of patients with PUR by vitamin D status and by the possible confounding variable. Error bars are standard error for proportion based on binomial distribution.
First, by ethnicity. Ethnicity categories were collapsed into three categories due to low frequencies for some of the categories used for primary data collection. Specifically, South East Asian and West Asian categories, which had few patients, were combined with South Asian.
dat$ethnicity <- as.character(dat$ethnicity)
dat$ethnicity[dat$ethnicity=="Middle.Eastern"] <- "Asian"
dat$ethnicity[dat$ethnicity=="SE.Asian"] <- "Asian"
dat$ethnicity[dat$ethnicity=="South.Asian"] <- "Asian"
XT <- xtabs(~ethnicity+PUR+vitD.Rx, data=dat)
# percentage PUR
percNVdAfr <- round((XT[4]/(XT[4]+XT[1]))*100, 1)
percNVdAsn <- round((XT[5]/(XT[5]+XT[2]))*100, 1)
percNVdWht <- round((XT[6]/(XT[6]+XT[3]))*100, 1)
percVdAfr <- round((XT[10]/(XT[10]+XT[7]))*100, 1)
percVdAsn <- round((XT[11]/(XT[11]+XT[8]))*100, 1)
percVdWht <- round((XT[12]/(XT[12]+XT[9]))*100, 1)
# as proportion for SE calcs
af1 <- (percNVdAfr/100)
as1 <- (percNVdAsn/100)
wh1 <- (percNVdWht/100)
af2 <- (percVdAfr/100)
as2 <- (percVdAsn/100)
wh2 <- (percVdWht/100)
# SE = sqrt(pbar*(1-pbar)/n)
af1SE <- sqrt(af1*(1-af1)/(XT[4]+XT[1]))
as1SE <- sqrt(as1*(1-as1)/(XT[5]+XT[2]))
wh1SE <- sqrt(wh1*(1-wh1)/(XT[6]+XT[3]))
af2SE <- sqrt(af2*(1-af2)/(XT[7]+XT[10]))
as2SE <- sqrt(as2*(1-as2)/(XT[8]+XT[11]))
wh2SE <- sqrt(wh2*(1-wh2)/(XT[9]+XT[12]))
SE <- c(af1SE, as1SE, wh1SE, af2SE, as2SE, wh2SE)
Percent.with.PUR <- c(percNVdAfr, percVdAfr, percNVdAsn, percVdAsn, percNVdWht, percVdWht)
Ethnicity <- c("African", "African", "Asian", "Asian", "White.European", "White.European")
VitaminD <- rep(c("No vitamin D", "Vitamin D"), 3)
EthVit <- data.frame(Ethnicity, VitaminD, Percent.with.PUR, SE)
g <- ggplot(EthVit, aes(x=Ethnicity, y=Percent.with.PUR, fill=Ethnicity))
g + geom_errorbar(aes(ymin=Percent.with.PUR-(0.1),
ymax=Percent.with.PUR+(100*SE)),
width=0.2, colour="black") +
geom_bar(stat="identity", colour="black") + facet_wrap(~VitaminD) +
theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1)) +
scale_fill_brewer(palette = 2) +
ylim(0,100) + guides(fill=FALSE) + xlab("") +
theme(strip.text=element_text(face="bold")) +
ggtitle("Proportion PUR by ethnicity and Vit D status") +
theme(plot.title=element_text(size=rel(1))) +
ylab("Percent with PUR")
Both Vitamin D prescribing behaviour and PUR incidence differed across these ethnic groupings. Patients prescribed vitamin D had higher rates of PUR irrespective of ethnic grouping.
Clinic sites :
A.vd <- sum(df$clinic=="A" & df$vitD.Rx=="VitD.Rx")
A.vd.PUR <- sum(df$clinic=="A" & df$vitD.Rx=="VitD.Rx" & df$PUR=="PUR")
A.nvd <- sum(df$clinic=="A" & df$vitD.Rx!="VitD.Rx")
A.nvd.PUR <- sum(df$clinic=="A" & df$vitD.Rx!="VitD.Rx" & df$PUR=="PUR")
A.prop.vd <- A.vd.PUR/A.vd
A.prop.nvd <- A.nvd.PUR/A.nvd
A.prop.vd.SE <- sqrt(A.prop.vd*(1-A.prop.vd)/A.vd)
A.prop.nvd.SE <- sqrt(A.prop.nvd*(1-A.prop.nvd)/A.nvd)
#
B.vd <- sum(df$clinic=="B" & df$vitD.Rx=="VitD.Rx")
B.vd.PUR <- sum(df$clinic=="B" & df$vitD.Rx=="VitD.Rx" & df$PUR=="PUR")
B.nvd <- sum(df$clinic=="B" & df$vitD.Rx!="VitD.Rx")
B.nvd.PUR <- sum(df$clinic=="B" & df$vitD.Rx!="VitD.Rx" & df$PUR=="PUR")
B.prop.vd <- B.vd.PUR/B.vd
B.prop.nvd <- B.nvd.PUR/B.nvd
B.prop.vd.SE <- sqrt(B.prop.vd*(1-B.prop.vd)/B.vd)
B.prop.nvd.SE <- sqrt(B.prop.nvd*(1-B.prop.nvd)/B.nvd)
#
C.vd <- sum(df$clinic=="C" & df$vitD.Rx=="VitD.Rx")
C.vd.PUR <- sum(df$clinic=="C" & df$vitD.Rx=="VitD.Rx" & df$PUR=="PUR")
C.nvd <- sum(df$clinic=="C" & df$vitD.Rx!="VitD.Rx")
C.nvd.PUR <- sum(df$clinic=="C" & df$vitD.Rx!="VitD.Rx" & df$PUR=="PUR")
C.prop.vd <- C.vd.PUR/C.vd
C.prop.nvd <- C.nvd.PUR/C.nvd
C.prop.vd.SE <- sqrt(C.prop.vd*(1-C.prop.vd)/C.vd)
C.prop.nvd.SE <- sqrt(C.prop.nvd*(1-C.prop.nvd)/C.nvd)
#
D.vd <- sum(df$clinic=="D" & df$vitD.Rx=="VitD.Rx")
D.vd.PUR <- sum(df$clinic=="D" & df$vitD.Rx=="VitD.Rx" & df$PUR=="PUR")
D.nvd <- sum(df$clinic=="D" & df$vitD.Rx!="VitD.Rx")
D.nvd.PUR <- sum(df$clinic=="D" & df$vitD.Rx!="VitD.Rx" & df$PUR=="PUR")
D.prop.vd <- D.vd.PUR/D.vd
D.prop.nvd <- D.nvd.PUR/D.nvd
D.prop.vd.SE <- sqrt(D.prop.vd*(1-D.prop.vd)/D.vd)
D.prop.nvd.SE <- sqrt(D.prop.nvd*(1-D.prop.nvd)/D.nvd)
props <- c(A.prop.vd, A.prop.nvd, B.prop.vd, B.prop.nvd, C.prop.vd, C.prop.nvd, D.prop.vd, D.prop.nvd)
SE <- c(A.prop.vd.SE, A.prop.nvd.SE, B.prop.vd.SE, B.prop.nvd.SE, C.prop.vd.SE, C.prop.nvd.SE, D.prop.vd.SE, D.prop.nvd.SE)
clinic <- c("A", "A", "B", "B", "C", "C", "D", "D")
VitaminD <- rep(c("Vitamin D", "No Vitamin D"), 4)
clinics <- data.frame(clinic, VitaminD, props, SE)
clinics <- dplyr::mutate(clinics, props=100*props)
g <- ggplot(clinics, aes(x=clinic, y=props, fill=clinic))
g + geom_errorbar(aes(ymin=props-(0.1),
ymax=props+(100*SE)),
width=0.2, colour="black") +
geom_bar(stat="identity", colour="black") + facet_wrap(~VitaminD) +
theme(axis.text.x=element_text(angle=45, hjust=1, vjust=1)) +
scale_fill_brewer(palette = 4) +
ylim(0,100) + guides(fill=FALSE) + xlab("Clinic site") +
theme(strip.text=element_text(face="bold")) +
ggtitle("Proportion PUR by clinic and Vit D status") +
theme(plot.title=element_text(size=rel(1))) +
ylab("Percent with PUR")
Clinic site C is an outlier, with lower rates of PUR, lower rates of vitamin D prescription, and is the only clinic site where an increased rate of PUR is not seen for patients prescribed vitamin D. Only 11 patients at clinic C were prescribed vitamin D, and none had a PUR.
Now three way comparisons between PUR and all combinations of 2 variables from set {ethnicity, vitamin D, clinic site, age}.
g <- ggpairs(df,
showStrips = TRUE,
mapping = ggplot2::aes(
color = PUR,
alpha=0.5),
columns = c("ethnicity","clinic", "vitD.Rx", "age"))
g + theme_grey(base_size = 8)
Vitamin D remains associated with PUR after adjusting for ethnicity in a logistic regression model:
vars <- c("PUR","ethnicity","clinic","age", "sex",
"more1site", "microscopy.diagnosis", "culture.diagnosis", "histology.diagnosis",
"baseline.steroid", "other.immuno.drug",
"lymphocytes", "monocytes", "neutrophils", "haemoglobin",
"ESR", "albumin", "CRP","vitD.baseline","vitD.6mths", "vitD.Rx")
dat <- df[ ,vars]
# ethnicity collapsed into 3 categories
dat$ethnicity <- as.character(dat$ethnicity)
dat$ethnicity[dat$ethnicity=="Middle.Eastern"] <- "Asian"
dat$ethnicity[dat$ethnicity=="SE.Asian"] <- "Asian"
dat$ethnicity[dat$ethnicity=="South.Asian"] <- "Asian"
# vit D remains significant after adjusting for ethnicity
summary(glm(PUR~vitD.Rx+ethnicity, data=dat, family="binomial"))
##
## Call:
## glm(formula = PUR ~ vitD.Rx + ethnicity, family = "binomial",
## data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9602 -0.4903 -0.4903 -0.3761 2.3169
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7451 0.5114 -3.412 0.000645 ***
## vitD.RxVitD.Rx 1.2100 0.3789 3.193 0.001407 **
## ethnicityAsian -0.3128 0.5267 -0.594 0.552604
## ethnicityWhite.European -0.8683 0.6779 -1.281 0.200227
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 209.29 on 248 degrees of freedom
## Residual deviance: 195.93 on 245 degrees of freedom
## AIC: 203.93
##
## Number of Fisher Scoring iterations: 5
Because there are 0 patients with vitamin D prescription and PUR at clinic C, a standard logistic regression model cannot be fit with clinic as a co-variable. Instead an exact logistic regression model is used here, and shows vitamin D association with PUR is overall independent of clinic site.
datClin <- dat
# collapse to binary clinic variable "clinic C" or "not clinic C"
# as C is the outlier
datClin$clinic <- as.character(datClin$clinic)
datClin$clinic[datClin$clinic!="C"] <- "nonC"
# making a collapsed data frame for exact regression with elrm package
x <- xtabs(~PUR + interaction(clinic, vitD.Rx), data = datClin)
cdat <- cdat <- data.frame(clinic = rep(1:0, 2), vitD.Rx = rep(0:1, each = 2),
PUR = x[2, ], ntrials = colSums(x))
exact.m <- elrm(formula = PUR/ntrials ~ vitD.Rx + clinic,
interest = ~ vitD.Rx + clinic,
iter = 5005000, dataset = cdat, burnIn = 5000, r = 2)
## Progress: 0% Progress: 5% Progress: 10% Progress: 15% Progress: 20% Progress: 25% Progress: 30% Progress: 35% Progress: 40% Progress: 45% Progress: 50% Progress: 55% Progress: 60% Progress: 65% Progress: 70% Progress: 75% Progress: 80% Progress: 85% Progress: 90% Progress: 95% Progress: 100%
summary(exact.m)
##
## Call:
## [[1]]
## elrm(formula = PUR/ntrials ~ vitD.Rx + clinic, interest = ~vitD.Rx +
## clinic, r = 2, iter = 5005000, dataset = cdat, burnIn = 5000)
##
##
## Results:
## estimate p-value p-value_se mc_size
## joint NA 0.00031 0.00001 5000000
## vitD.Rx 1.34424 0.00103 0.00011 83724
## clinic -1.54588 0.04047 0.00556 2545
##
## 95% Confidence Intervals for Parameters
##
## lower upper
## vitD.Rx 0.4707123 2.558588197
## clinic -4.8906887 0.004469184
All variables significant at the <0.05 level included except clinic (due to zero cell problem). Ethnicity also included as important possible confounder for vit D and PUR. Because ~12% of lymphocyte count observations are missing, in a non-random way, these values are imputed using MCMC. Lymphocyte count is transformed to fit more normal distribution.
# imputation on a reduced data frame
set.seed(123)
vars <- c("PUR","ethnicity","clinic","age", "sex",
"more1site", "microscopy.diagnosis", "vitD.Rx",
"baseline.steroid", "other.immuno.drug",
"lymphocytes", "monocytes", "CRP")
dat <- df[ ,vars]
dat <- mice(dat)
##
## iter imp variable
## 1 1 lymphocytes monocytes CRP
## 1 2 lymphocytes monocytes CRP
## 1 3 lymphocytes monocytes CRP
## 1 4 lymphocytes monocytes CRP
## 1 5 lymphocytes monocytes CRP
## 2 1 lymphocytes monocytes CRP
## 2 2 lymphocytes monocytes CRP
## 2 3 lymphocytes monocytes CRP
## 2 4 lymphocytes monocytes CRP
## 2 5 lymphocytes monocytes CRP
## 3 1 lymphocytes monocytes CRP
## 3 2 lymphocytes monocytes CRP
## 3 3 lymphocytes monocytes CRP
## 3 4 lymphocytes monocytes CRP
## 3 5 lymphocytes monocytes CRP
## 4 1 lymphocytes monocytes CRP
## 4 2 lymphocytes monocytes CRP
## 4 3 lymphocytes monocytes CRP
## 4 4 lymphocytes monocytes CRP
## 4 5 lymphocytes monocytes CRP
## 5 1 lymphocytes monocytes CRP
## 5 2 lymphocytes monocytes CRP
## 5 3 lymphocytes monocytes CRP
## 5 4 lymphocytes monocytes CRP
## 5 5 lymphocytes monocytes CRP
dat <- mice::complete(dat)
dat2 <- dat # save for later propensity score matching analysis
# a model with all univariate p<0.05
# + ethnicity as an important potential confounder
# but no 'clinic'
# ethnicity collapsed to one binary variable
dat$ethnicity <- as.character(dat$ethnicity)
dat$ethnicity[dat$ethnicity!="White.European"] <- "nonWhite.European"
full.m <- glm(PUR~vitD.Rx+I(log(lymphocytes+1))+more1site+microscopy.diagnosis+I(log(CRP+1))+ethnicity+I(log(age)),
data=dat, family="binomial")
summary(full.m)
##
## Call:
## glm(formula = PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site +
## microscopy.diagnosis + I(log(CRP + 1)) + ethnicity + I(log(age)),
## family = "binomial", data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4251 -0.5680 -0.3811 -0.2253 2.8868
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.38913 2.34359 1.446 0.14814
## vitD.Rx2 1.23256 0.40845 3.018 0.00255 **
## I(log(lymphocytes + 1)) -1.36939 0.76119 -1.799 0.07202 .
## more1site2 0.50294 0.40036 1.256 0.20904
## microscopy.diagnosis2 1.11245 0.39725 2.800 0.00510 **
## I(log(CRP + 1)) 0.06174 0.16020 0.385 0.69995
## ethnicityWhite.European -0.05363 0.60390 -0.089 0.92924
## I(log(age)) -1.47126 0.59445 -2.475 0.01332 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 209.29 on 248 degrees of freedom
## Residual deviance: 172.78 on 241 degrees of freedom
## AIC: 188.78
##
## Number of Fisher Scoring iterations: 5
In this ‘full’ model, age, microscopy diagnosis (AFB seen on diagnostic sample), and vitamin D prescription remain significant.
Next a backwards stepwise selection of variables based on Akaike Information Criteria (AIC) impact of removing the variable was performed:
step.m <- stepAIC(full.m, direction = "backward")
## Start: AIC=188.78
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis +
## I(log(CRP + 1)) + ethnicity + I(log(age))
##
## Df Deviance AIC
## - ethnicity 1 172.78 186.78
## - I(log(CRP + 1)) 1 172.93 186.93
## - more1site 1 174.35 188.35
## <none> 172.78 188.78
## - I(log(lymphocytes + 1)) 1 176.10 190.10
## - I(log(age)) 1 179.15 193.15
## - microscopy.diagnosis 1 180.68 194.68
## - vitD.Rx 1 181.67 195.67
##
## Step: AIC=186.78
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis +
## I(log(CRP + 1)) + I(log(age))
##
## Df Deviance AIC
## - I(log(CRP + 1)) 1 172.94 184.94
## - more1site 1 174.37 186.37
## <none> 172.78 186.78
## - I(log(lymphocytes + 1)) 1 176.11 188.11
## - microscopy.diagnosis 1 180.69 192.69
## - I(log(age)) 1 181.00 193.00
## - vitD.Rx 1 181.72 193.72
##
## Step: AIC=184.94
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis +
## I(log(age))
##
## Df Deviance AIC
## - more1site 1 174.69 184.69
## <none> 172.94 184.94
## - I(log(lymphocytes + 1)) 1 177.28 187.28
## - microscopy.diagnosis 1 180.83 190.83
## - I(log(age)) 1 181.35 191.35
## - vitD.Rx 1 181.98 191.98
##
## Step: AIC=184.69
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + microscopy.diagnosis +
## I(log(age))
##
## Df Deviance AIC
## <none> 174.69 184.69
## - I(log(lymphocytes + 1)) 1 180.08 188.08
## - microscopy.diagnosis 1 182.54 190.54
## - I(log(age)) 1 183.61 191.61
## - vitD.Rx 1 184.70 192.70
summary(step.m)
##
## Call:
## glm(formula = PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + microscopy.diagnosis +
## I(log(age)), family = "binomial", data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3473 -0.5534 -0.3772 -0.2367 2.8400
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.1887 2.0252 2.068 0.03861 *
## vitD.Rx2 1.2901 0.4033 3.199 0.00138 **
## I(log(lymphocytes + 1)) -1.6228 0.7118 -2.280 0.02262 *
## microscopy.diagnosis2 1.0974 0.3929 2.793 0.00522 **
## I(log(age)) -1.5270 0.5324 -2.868 0.00413 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 209.29 on 248 degrees of freedom
## Residual deviance: 174.69 on 244 degrees of freedom
## AIC: 184.69
##
## Number of Fisher Scoring iterations: 5
In this model, age, microscopy diagnosis (AFB seen on diagnostic sample), vitamin D prescription, and lymphocyte count are significant.
This reduced model comapres favorably to the ‘full’ model. It has similar residual deviance and predictive performance in the data set, and is non-inferior in a -2 log likelihood test (with better AIC score):
PseudoR2(step.m)["Nagelkerke"]
## Nagelkerke
## 0.2282075
PseudoR2(full.m)["Nagelkerke"]
## Nagelkerke
## 0.2399237
PseudoR2(step.m)["AIC"]
## AIC
## 184.6892
PseudoR2(full.m)["AIC"]
## AIC
## 188.7761
step.m.p <- predict(step.m, type = "response")
full.m.p <- predict(full.m, type = "response")
roc(dat$PUR, step.m.p, data=dat)$auc
## Area under the curve: 0.7743
roc(dat$PUR, full.m.p, data=dat)$auc
## Area under the curve: 0.7815
anova(full.m, step.m, test ="Chisq")
## Analysis of Deviance Table
##
## Model 1: PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis +
## I(log(CRP + 1)) + ethnicity + I(log(age))
## Model 2: PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + microscopy.diagnosis +
## I(log(age))
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 241 172.78
## 2 244 174.69 -3 -1.9131 0.5906
An automated search for an improved model including all possible two way interactions between age, microscopy diagnosis (AFB seen on diagnostic sample), vitamin D prescription, and lymphocyte count was performed. The best model including interaction terms was not significantly better than the previous stepwise model.
search <- step(step.m, ~.^2)
## Start: AIC=184.69
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + microscopy.diagnosis +
## I(log(age))
##
## Df Deviance AIC
## + I(log(lymphocytes + 1)):microscopy.diagnosis 1 172.56 184.56
## <none> 174.69 184.69
## + microscopy.diagnosis:I(log(age)) 1 173.73 185.73
## + vitD.Rx:microscopy.diagnosis 1 173.90 185.90
## + vitD.Rx:I(log(age)) 1 174.54 186.54
## + I(log(lymphocytes + 1)):I(log(age)) 1 174.57 186.57
## + vitD.Rx:I(log(lymphocytes + 1)) 1 174.68 186.68
## - I(log(lymphocytes + 1)) 1 180.08 188.08
## - microscopy.diagnosis 1 182.54 190.54
## - I(log(age)) 1 183.61 191.61
## - vitD.Rx 1 184.70 192.70
##
## Step: AIC=184.56
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + microscopy.diagnosis +
## I(log(age)) + I(log(lymphocytes + 1)):microscopy.diagnosis
##
## Df Deviance AIC
## <none> 172.56 184.56
## - I(log(lymphocytes + 1)):microscopy.diagnosis 1 174.69 184.69
## + vitD.Rx:microscopy.diagnosis 1 171.57 185.57
## + microscopy.diagnosis:I(log(age)) 1 171.95 185.95
## + I(log(lymphocytes + 1)):I(log(age)) 1 172.12 186.12
## + vitD.Rx:I(log(age)) 1 172.49 186.49
## + vitD.Rx:I(log(lymphocytes + 1)) 1 172.54 186.54
## - I(log(age)) 1 182.02 192.02
## - vitD.Rx 1 182.35 192.35
anova(search)
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: PUR
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df
## NULL 248
## vitD.Rx 1 11.5120 247
## I(log(lymphocytes + 1)) 1 6.1153 246
## microscopy.diagnosis 1 8.0510 245
## I(log(age)) 1 8.9235 244
## I(log(lymphocytes + 1)):microscopy.diagnosis 1 2.1312 243
## Resid. Dev
## NULL 209.29
## vitD.Rx 197.78
## I(log(lymphocytes + 1)) 191.66
## microscopy.diagnosis 183.61
## I(log(age)) 174.69
## I(log(lymphocytes + 1)):microscopy.diagnosis 172.56
Using the stepwise model without interaction terms, cases with disproportionate influence were identified (defined as Cook’s distance > 4 standard deviations). These 7 cases were removed and full and stepwise models were rerun to give a final model.
summary(cooks.distance(step.m))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000094 0.0000883 0.0003998 0.0042930 0.0024280 0.0666300
cutoff <- sd(cooks.distance(step.m))*4
plot(cooks.distance(step.m))
abline(h=cutoff, col="darkgreen")
dat.reduced <- dat[cooks.distance(step.m)<cutoff,]
cooksAdj.full.m <- glm(PUR~vitD.Rx+I(log(lymphocytes+1))+more1site+microscopy.diagnosis+I(log(CRP+1))+ethnicity+I(log(age)),
data=dat.reduced, family="binomial")
cooksAdj.step.m <- stepAIC(cooksAdj.full.m, direction = "backward")
## Start: AIC=151.37
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis +
## I(log(CRP + 1)) + ethnicity + I(log(age))
##
## Df Deviance AIC
## - ethnicity 1 135.43 149.43
## - I(log(CRP + 1)) 1 135.45 149.45
## <none> 135.37 151.37
## - more1site 1 138.53 152.53
## - I(log(lymphocytes + 1)) 1 140.89 154.89
## - vitD.Rx 1 141.66 155.66
## - microscopy.diagnosis 1 144.31 158.31
## - I(log(age)) 1 145.92 159.92
##
## Step: AIC=149.43
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis +
## I(log(CRP + 1)) + I(log(age))
##
## Df Deviance AIC
## - I(log(CRP + 1)) 1 135.53 147.53
## <none> 135.43 149.43
## - more1site 1 138.63 150.63
## - I(log(lymphocytes + 1)) 1 140.96 152.96
## - vitD.Rx 1 141.77 153.77
## - microscopy.diagnosis 1 144.31 156.31
## - I(log(age)) 1 149.09 161.09
##
## Step: AIC=147.53
## PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site + microscopy.diagnosis +
## I(log(age))
##
## Df Deviance AIC
## <none> 135.53 147.53
## - more1site 1 138.94 148.94
## - vitD.Rx 1 141.91 151.91
## - I(log(lymphocytes + 1)) 1 142.33 152.33
## - microscopy.diagnosis 1 144.36 154.36
## - I(log(age)) 1 149.50 159.50
final.m <- cooksAdj.step.m
summary(final.m)
##
## Call:
## glm(formula = PUR ~ vitD.Rx + I(log(lymphocytes + 1)) + more1site +
## microscopy.diagnosis + I(log(age)), family = "binomial",
## data = dat.reduced)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5126 -0.4609 -0.2795 -0.1544 2.5752
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.7808 2.6035 2.605 0.009200 **
## vitD.Rx2 1.1874 0.4646 2.556 0.010588 *
## I(log(lymphocytes + 1)) -2.2648 0.9027 -2.509 0.012108 *
## more1site2 0.8309 0.4533 1.833 0.066827 .
## microscopy.diagnosis2 1.3175 0.4494 2.932 0.003368 **
## I(log(age)) -2.3408 0.6793 -3.446 0.000569 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 181.38 on 241 degrees of freedom
## Residual deviance: 135.53 on 236 degrees of freedom
## AIC: 147.53
##
## Number of Fisher Scoring iterations: 6
with(final.m, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
## [1] 9.716045e-09
final.m.p <- predict(final.m, type = "response")
roc(dat.reduced$PUR, final.m.p, data=dat.reduced)$auc
## Area under the curve: 0.8437
PseudoR2(final.m)["Nagelkerke"]
## Nagelkerke
## 0.3272972
PseudoR2(final.m)["AIC"]
## AIC
## 147.5254
As seen above clinic site was related to both probability of vitamin D prescribing and probability of PUR occurance (hence a significant risk is a confounder), but could not be included in the final logistic regression model due to zero cell occurence. As an additional check that results are robust despite this problem, here a propensity score analysis is performed where cases are matched on propensity for vitamin D to be prescribed, using covariates that were related to both vitamin D prescription and PUR outcome. Using the (smaller) matched dataset an assocation between vitamin D prescribing and PUR is still found.
### *** uses a earlier version of dat before ethnicity collapsed to binary
# how much do the co-factors influence vitamin D Rx propensity / probability?
vitDRx.m <- glm(vitD.Rx~age+ethnicity+clinic, data=dat2,
family="binomial")
summary(vitDRx.m)
##
## Call:
## glm(formula = vitD.Rx ~ age + ethnicity + clinic, family = "binomial",
## data = dat2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5549 -0.6141 -0.4020 -0.1956 2.2777
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.64112 0.77109 -3.425 0.000614 ***
## age -0.00149 0.01308 -0.114 0.909280
## ethnicity2 1.84059 1.62751 1.131 0.258089
## ethnicity3 -1.36501 1.18718 -1.150 0.250228
## ethnicity4 0.21478 0.50573 0.425 0.671056
## ethnicity5 -1.19302 0.66990 -1.781 0.074929 .
## clinic2 2.33300 0.48413 4.819 1.44e-06 ***
## clinic3 1.72506 0.55254 3.122 0.001796 **
## clinic4 2.94495 0.57766 5.098 3.43e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 267.91 on 248 degrees of freedom
## Residual deviance: 216.06 on 240 degrees of freedom
## AIC: 234.06
##
## Number of Fisher Scoring iterations: 5
library(BaylorEdPsych)
PseudoR2(vitDRx.m)["Nagelkerke"]
## Nagelkerke
## 0.2852224
vitDRx.m.p <- predict(vitDRx.m, type = "response")
library(pROC)
roc(dat2$vitD.Rx, vitDRx.m.p, data=dat2)$auc
## Area under the curve: 0.7908
### propensity score matching with MatchIt
library(MatchIt)
# grouping variable is dat$vitD.Rx
# matching variables chosen : age, clinic, ethnicity
# needs binary or boolean grouping variable
dat2$vitD.Rx <- as.character(dat2$vitD.Rx)
dat2$vitD.Rx[dat2$vitD.Rx=="Nil.vitD.Rx"] <- 0
dat2$vitD.Rx[dat2$vitD.Rx=="VitD.Rx"] <- 1
dat2$vitD.Rx <- as.numeric(dat2$vitD.Rx)
#matching treatments:controls in 1:1 ratio
m.out <- matchit(vitD.Rx ~ age + clinic + ethnicity,
data = dat2, method = "nearest", ratio = 1)
summary(m.out)
##
## Call:
## matchit(formula = vitD.Rx ~ age + clinic + ethnicity, data = dat2,
## method = "nearest", ratio = 1)
##
## Summary of balance for all data:
## Means Treated Means Control SD Control Mean Diff eQQ Med
## distance 0.3856 0.1824 0.1708 0.2032 0.2024
## age 37.3158 41.9333 16.6348 -4.6175 4.7000
## clinicA 0.1228 0.5052 0.5013 -0.3824 0.0000
## clinicB 0.4211 0.1979 0.3995 0.2231 0.0000
## clinicC 0.1930 0.2188 0.4145 -0.0258 0.0000
## clinicD 0.2632 0.0781 0.2691 0.1850 0.0000
## ethnicity2 0.0175 0.0052 0.0722 0.0123 0.0000
## ethnicity3 0.0175 0.0521 0.2228 -0.0345 0.0000
## ethnicity4 0.6842 0.5729 0.4959 0.1113 0.0000
## ethnicity5 0.1228 0.2760 0.4482 -0.1532 0.0000
## eQQ Mean eQQ Max
## distance 0.2056 0.3775
## age 4.8105 13.5000
## clinicA 0.3860 1.0000
## clinicB 0.2281 1.0000
## clinicC 0.0351 1.0000
## clinicD 0.1754 1.0000
## ethnicity2 0.0000 0.0000
## ethnicity3 0.0351 1.0000
## ethnicity4 0.1228 1.0000
## ethnicity5 0.1579 1.0000
##
##
## Summary of balance for matched data:
## Means Treated Means Control SD Control Mean Diff eQQ Med
## distance 0.3856 0.3750 0.1688 0.0106 0.0013
## age 37.3158 37.0596 13.8588 0.2561 1.4000
## clinicA 0.1228 0.1053 0.3096 0.0175 0.0000
## clinicB 0.4211 0.4386 0.5006 -0.0175 0.0000
## clinicC 0.1930 0.2456 0.4343 -0.0526 0.0000
## clinicD 0.2632 0.2105 0.4113 0.0526 0.0000
## ethnicity2 0.0175 0.0000 0.0000 0.0175 0.0000
## ethnicity3 0.0175 0.0175 0.1325 0.0000 0.0000
## ethnicity4 0.6842 0.7368 0.4443 -0.0526 0.0000
## ethnicity5 0.1228 0.1228 0.3311 0.0000 0.0000
## eQQ Mean eQQ Max
## distance 0.0111 0.096
## age 2.0737 7.500
## clinicA 0.0175 1.000
## clinicB 0.0175 1.000
## clinicC 0.0526 1.000
## clinicD 0.0526 1.000
## ethnicity2 0.0175 1.000
## ethnicity3 0.0000 0.000
## ethnicity4 0.0526 1.000
## ethnicity5 0.0000 0.000
##
## Percent Balance Improvement:
## Mean Diff. eQQ Med eQQ Mean eQQ Max
## distance 94.7811 99.3405 94.6230 74.5751
## age 94.4529 70.2128 56.8928 44.4444
## clinicA 95.4122 0.0000 95.4545 0.0000
## clinicB 92.1376 0.0000 92.3077 0.0000
## clinicC -104.2553 0.0000 -50.0000 0.0000
## clinicD 71.5556 0.0000 70.0000 0.0000
## ethnicity2 -42.2222 0.0000 -Inf -Inf
## ethnicity3 100.0000 0.0000 100.0000 100.0000
## ethnicity4 52.7094 0.0000 57.1429 0.0000
## ethnicity5 100.0000 0.0000 100.0000 100.0000
##
## Sample sizes:
## Control Treated
## All 192 57
## Matched 57 57
## Unmatched 135 0
## Discarded 0 0
# plot(m.out, type = "jitter")
plot(m.out, type = "hist")
# extract the matched data
m.data1 <- match.data(m.out)
# check vit D Rx still associated with PUR after matching
matched.m <- glm(PUR~vitD.Rx, data=m.data1, family="binomial")
summary(matched.m)
##
## Call:
## glm(formula = PUR ~ vitD.Rx, family = "binomial", data = m.data1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8416 -0.8416 -0.5500 -0.5500 1.9817
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.8124 0.3813 -4.753 2.01e-06 ***
## vitD.Rx 0.9567 0.4788 1.998 0.0457 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 119.93 on 113 degrees of freedom
## Residual deviance: 115.71 on 112 degrees of freedom
## AIC: 119.71
##
## Number of Fisher Scoring iterations: 4
exp(confint(matched.m))
## 2.5 % 97.5 %
## (Intercept) 0.07150922 0.3254047
## vitD.Rx 1.04469433 6.9629099
exp(coef(matched.m))
## (Intercept) vitD.Rx
## 0.1632653 2.6031250
#matching treatments:controls in 1:2 ratio
m.out <- matchit(vitD.Rx ~ age + clinic + ethnicity,
data = dat2, method = "nearest", ratio = 2)
summary(m.out)
##
## Call:
## matchit(formula = vitD.Rx ~ age + clinic + ethnicity, data = dat2,
## method = "nearest", ratio = 2)
##
## Summary of balance for all data:
## Means Treated Means Control SD Control Mean Diff eQQ Med
## distance 0.3856 0.1824 0.1708 0.2032 0.2024
## age 37.3158 41.9333 16.6348 -4.6175 4.7000
## clinicA 0.1228 0.5052 0.5013 -0.3824 0.0000
## clinicB 0.4211 0.1979 0.3995 0.2231 0.0000
## clinicC 0.1930 0.2188 0.4145 -0.0258 0.0000
## clinicD 0.2632 0.0781 0.2691 0.1850 0.0000
## ethnicity2 0.0175 0.0052 0.0722 0.0123 0.0000
## ethnicity3 0.0175 0.0521 0.2228 -0.0345 0.0000
## ethnicity4 0.6842 0.5729 0.4959 0.1113 0.0000
## ethnicity5 0.1228 0.2760 0.4482 -0.1532 0.0000
## eQQ Mean eQQ Max
## distance 0.2056 0.3775
## age 4.8105 13.5000
## clinicA 0.3860 1.0000
## clinicB 0.2281 1.0000
## clinicC 0.0351 1.0000
## clinicD 0.1754 1.0000
## ethnicity2 0.0000 0.0000
## ethnicity3 0.0351 1.0000
## ethnicity4 0.1228 1.0000
## ethnicity5 0.1579 1.0000
##
##
## Summary of balance for matched data:
## Means Treated Means Control SD Control Mean Diff eQQ Med
## distance 0.3856 0.2654 0.1782 0.1202 0.1427
## age 37.3158 42.1219 17.5984 -4.8061 5.8000
## clinicA 0.1228 0.1667 0.3743 -0.0439 0.0000
## clinicB 0.4211 0.3333 0.4735 0.0877 0.0000
## clinicC 0.1930 0.3684 0.4845 -0.1754 0.0000
## clinicD 0.2632 0.1316 0.3395 0.1316 0.0000
## ethnicity2 0.0175 0.0088 0.0937 0.0088 0.0000
## ethnicity3 0.0175 0.0526 0.2243 -0.0351 0.0000
## ethnicity4 0.6842 0.5088 0.5021 0.1754 0.0000
## ethnicity5 0.1228 0.3246 0.4703 -0.2018 0.0000
## eQQ Mean eQQ Max
## distance 0.1236 0.2842
## age 5.3386 12.2000
## clinicA 0.0351 1.0000
## clinicB 0.0877 1.0000
## clinicC 0.1754 1.0000
## clinicD 0.1404 1.0000
## ethnicity2 0.0000 0.0000
## ethnicity3 0.0351 1.0000
## ethnicity4 0.1754 1.0000
## ethnicity5 0.1930 1.0000
##
## Percent Balance Improvement:
## Mean Diff. eQQ Med eQQ Mean eQQ Max
## distance 40.8360 29.5173 39.8790 24.7025
## age -4.0843 -23.4043 -10.9774 9.6296
## clinicA 88.5305 0.0000 90.9091 0.0000
## clinicB 60.6880 0.0000 61.5385 0.0000
## clinicC -580.8511 0.0000 -400.0000 0.0000
## clinicD 28.8889 0.0000 20.0000 0.0000
## ethnicity2 28.8889 0.0000 0.0000 0.0000
## ethnicity3 -1.5873 0.0000 0.0000 0.0000
## ethnicity4 -57.6355 0.0000 -42.8571 0.0000
## ethnicity5 -31.6637 0.0000 -22.2222 0.0000
##
## Sample sizes:
## Control Treated
## All 192 57
## Matched 114 57
## Unmatched 78 0
## Discarded 0 0
# plot(m.out, type = "jitter")
plot(m.out, type = "hist")
# extract the matched data
m.data1 <- match.data(m.out)
# check vit D Rx still associated with PUR after matching
matched.m <- glm(PUR~vitD.Rx, data=m.data1, family="binomial")
summary(matched.m)
##
## Call:
## glm(formula = PUR ~ vitD.Rx, family = "binomial", data = m.data1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8416 -0.5312 -0.5312 -0.5312 2.0140
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.8871 0.2771 -6.811 9.7e-12 ***
## vitD.Rx 1.0314 0.4007 2.574 0.0101 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 164.86 on 170 degrees of freedom
## Residual deviance: 158.25 on 169 degrees of freedom
## AIC: 162.25
##
## Number of Fisher Scoring iterations: 4
exp(confint(matched.m))
## 2.5 % 97.5 %
## (Intercept) 0.0845356 0.2524428
## vitD.Rx 1.2801606 6.2206071
exp(coef(matched.m))
## (Intercept) vitD.Rx
## 0.1515152 2.8050000