knitr::opts_chunk$set(echo = TRUE,
fig.align = 'center')
# Loading needed packages
pacman::p_load(readxl, tidyverse, car, lme4, lmerTest, doBy, gee, geepack)
# Custom theme for text
text_theme <- theme(axis.title = element_text(size = 12),
axis.text = element_text(size = 10),
plot.title = element_text(size = 16, hjust = 0.5),
plot.subtitle = element_text(size = 12, hjust = 0.5),
plot.caption = element_text(size = 8))
toe <-
read_excel("toe fungues.xlsx") |>
# Cleaning the data
janitor::clean_names() |>
mutate(
subject_id = factor(subject_id),
severe = response,
response = factor(response, labels = c('none/mild', 'moderate/severe')),
treatment = factor(treatment, labels = c("I", "T")),
visit0 = factor(visit - 1) # Number of visits after the initial visit
)
tibble(toe)
## # A tibble: 1,908 × 7
## subject_id response treatment month visit severe visit0
## <fct> <fct> <fct> <dbl> <dbl> <dbl> <fct>
## 1 1 moderate/severe T 0 1 1 0
## 2 1 moderate/severe T 0.857 2 1 1
## 3 1 moderate/severe T 3.54 3 1 2
## 4 1 none/mild T 4.54 4 0 3
## 5 1 none/mild T 7.54 5 0 4
## 6 1 none/mild T 10.0 6 0 5
## 7 1 none/mild T 13.1 7 0 6
## 8 2 none/mild I 0 1 0 0
## 9 2 none/mild I 0.964 2 0 1
## 10 2 moderate/severe I 2 3 1 2
## # ℹ 1,898 more rows
The toe data set has 5 variables on over 1900
measurements of patients with a certain toe fungus. Each patient takes
one of two different medications and reports monthly (ish). Each visit
the severity of their condition is recorded.
The five variables are:
subject_id: The unique ID number for each
patient
response: The severity of the condition
treatment: Which of the two treatments the patient
is assigned
month: Time in months since the start of the
experiment
visit0: The visit number for that measurement with
initial visit as 0.
summary(toe)
## subject_id response treatment month
## 1 : 7 none/mild :1500 I:937 Min. : 0.000
## 3 : 7 moderate/severe: 408 T:971 1st Qu.: 1.000
## 4 : 7 Median : 3.000
## 6 : 7 Mean : 4.691
## 7 : 7 3rd Qu.: 8.893
## 9 : 7 Max. :18.500
## (Other):1866
## visit severe visit0
## Min. :1.000 Min. :0.0000 0:294
## 1st Qu.:2.000 1st Qu.:0.0000 1:288
## Median :4.000 Median :0.0000 2:283
## Mean :3.896 Mean :0.2138 3:272
## 3rd Qu.:6.000 3rd Qu.:0.0000 4:263
## Max. :7.000 Max. :1.0000 5:244
## 6:264
Each patient is scheduled to visit after a month for the first three visits, then once every three months for the last 3 visits, but that doesn’t always (almost never) happens. So let’s look at the average time and standard deviation of each visit in months:
toe |>
summarize(
.by = visit0,
month_avg = mean(month),
month_sd = sd(month)
)
## # A tibble: 7 × 3
## visit0 month_avg month_sd
## <fct> <dbl> <dbl>
## 1 0 0 0
## 2 1 1.03 0.115
## 3 2 2.06 0.202
## 4 3 3.13 0.273
## 5 4 6.28 0.541
## 6 5 9.31 0.656
## 7 6 12.5 0.895
While the average is close to the scheduled time, the spread in time between visits grows the longer the study goes on (less consistently showing during the scheduled appointment).
Let’s start by creating some data visualizations to get an idea of how the data look
ggplot(
data = toe,
mapping = aes(
x = month,
y = response,
fill = treatment
)
) +
geom_boxplot() +
theme_bw()
The basic box plot doesn’t indicate much of a difference between the two treatments on severity over time. Let’s look at the ‘success percent’ (none/mild) by visit and treatment, ignoring the subject
toe |>
mutate(
.by = visit0,
avg_month = round(mean(month))
) |>
# Counting the number of patients at each visit/response/treatment combo
summarize(
.by = c(avg_month, treatment, response),
n_obs = n()
) |>
# Proportion of patients with each response for each visit treatment combo
mutate(
.by = c(avg_month, treatment),
bad_prop = n_obs/sum(n_obs)
) |>
# only moderate and severe rows
filter(
response == 'moderate/severe'
) |>
# Line Plot
ggplot(
mapping = aes(
x = avg_month,
y = bad_prop,
color = treatment,
weight = n_obs
)
) +
geom_point(
size=2,
show.legend = F
) +
geom_line() +
scale_x_continuous(
breaks = toe |>
summarize(.by = visit0, month_avg = mean(month)) |>
pull(month_avg) |>
round(),
minor_breaks = NULL
) +
labs(
x = "Months after Initival Visit",
y = "Proportion with Moderate or Severe Separation",
title = "Proportion of Patients with Moderate or Severe Separation",
subtitle = "By Number of Visits and Treatment",
color = "Medication"
) +
theme_bw() +
text_theme +
theme(
legend.position = 'inside',
legend.position.inside = c(0.9, 0.85),
legend.key = element_rect(fill=NA)
) +
scale_color_discrete(
labels = c("Itraconzaole", "Terbinafine"),
guide= guide_legend(reverse=T)
)
We can see at the start of the study that both medications had about a 38% moderate or severe separation rate. After the subjects started taking their randomly assigned medication, Terbinafine performed better during each month, but both leveled out after 6 months, not seeing substantial improvement after then.
This graph ignores that the data are repeated measurements on 294 unique subjects. We can use a Generalized Estimating Equation (GEE) to estimate the effect of the mediation of separation severity, accounting for the correlation structure for each patient!
Important: We’ll look at the number of missing cases per visit, but we won’t include the missingness in our analysis
toe |>
# Long way of including a missing value in long format data :(
# Pivot to wide to include if the value is missing
pivot_wider(
id_cols = c(subject_id, treatment),
values_from = response,
names_from = visit0
) |>
# Pivot back to long
pivot_longer(
cols = c(-subject_id, -treatment),
names_to = 'visit0',
values_to = 'response'
) |>
# Calculating the missing percentage of each combo of treatment and visit
mutate(
.by = c(treatment, visit0),
missing_prop = mean(is.na(response))
) |>
# line graph of the missing percentage of visit
ggplot(
mapping = aes(
x = visit0,
y = missing_prop,
color = treatment,
group = treatment
)
) +
geom_line(
linewidth = 1
) +
labs(
x = "Visit Number",
y = "Proportion of No-Show Patients",
title = "Proportion of No-Show Patients by Treatment and Visit",
color = "Medication"
) +
theme_bw() +
text_theme +
theme(
legend.position = 'inside',
legend.position.inside = c(0.8, 0.15),
legend.key = element_rect(fill = NA)
) +
scale_color_discrete(
labels = c("Itraconzaole", "Terbinafine"),
guide= guide_legend(reverse = T)
)
The fifth visit had almost 20% of the nearly 300 patients not show up. It would be a good idea to follow up with them to see why they missed the fifth visit, especially for those who arrived for the final visit.
geeglm()We can use the function geeglm() to fit the “model” a
very similar way we would using glm() with
formula, family, and data
It also has some additional arguments that we need to supply:
id = column that stores the cluster id
subject_idstd.err = "san.se": tells it to calculate the
standard errors using the sandwich estimator
corstr = the correlation structure to use
not specifying assumes an independent model (ie all correlations = 0)
for clustered data (like children in classrooms), we use ‘exchangable’ since each individual in a cluster are exchangable with anyone else in the cluster
for longitudinal data, we typically use ‘ar1’ since observations near each other in time are more similar than those far apart in time
toe_indep <-
geeglm(
formula = severe ~ treatment * month,
family = binomial(link = "logit"),
id = subject_id,
data = toe,
std.err = "san.se",
corstr = 'independence',
scale.fix = T # Assumes a 'constant' variance
)
# Estimates and results
broom::tidy(toe_indep)
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.557 0.171 10.6 0.00115
## 2 treatmentT -0.000582 0.251 0.00000538 0.998
## 3 month -0.170 0.0292 34.1 0.00000000522
## 4 treatmentT:month -0.0672 0.0521 1.66 0.197
# Fit stats
broom::glance(toe_indep)
## # A tibble: 1 × 5
## df.residual n.clusters max.cluster.size alpha gamma
## <int> <int> <int> <dbl> <dbl>
## 1 1904 294 7 NA 1.04
How does it compare to a fixed glm? Let’s take a look!
glm(
formula = severe ~ treatment * month,
family = binomial(link="logit"),
data = toe
) |>
broom::tidy()
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.557 0.109 -5.11 3.25e- 7
## 2 treatmentT -0.000582 0.156 -0.00373 9.97e- 1
## 3 month -0.170 0.0236 -7.21 5.58e-13
## 4 treatmentT:month -0.0672 0.0375 -1.79 7.32e- 2
Note the estimates are the same (yay!), but the standard errors are different (boo!)
Let’s look at a couple of different models
We can easily update a saved model in R using the
update() function. You give it a model and the arguments of
the original function you want to change.
If we want to update the correlation structure, we can use
update()!
toe_ex <- update(toe_indep, corstr = "exchangeable")
broom::tidy(toe_ex)
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.584 0.172 11.5 0.000691
## 2 treatmentT 0.00681 0.260 0.000683 0.979
## 3 month -0.171 0.0301 32.5 0.0000000121
## 4 treatmentT:month -0.0781 0.0543 2.07 0.150
broom::glance(toe_ex)
## # A tibble: 1 × 5
## df.residual n.clusters max.cluster.size alpha gamma
## <int> <int> <int> <dbl> <dbl>
## 1 1904 294 7 0.441 1.04
toe_ar <- update(toe_indep, corstr= "ar1")
broom::tidy(toe_ar)
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.642 0.168 14.5 0.000137
## 2 treatmentT 0.0677 0.251 0.0725 0.788
## 3 month -0.138 0.0274 25.5 0.000000432
## 4 treatmentT:month -0.0957 0.0515 3.45 0.0631
broom::glance(toe_ar)
## # A tibble: 1 × 5
## df.residual n.clusters max.cluster.size alpha gamma
## <int> <int> <int> <dbl> <dbl>
## 1 1904 294 7 0.675 1.04
To build an unstructured correlation matrix, we can use
corstr = 'unstructured'.
toe_un <- update(toe_indep, corstr = "unstructured")
broom::tidy(toe_un)
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.737 0.166 19.6 0.00000954
## 2 treatmentT 0.0367 0.246 0.0222 0.882
## 3 month -0.132 0.0264 25.2 0.000000518
## 4 treatmentT:month -0.0900 0.0483 3.47 0.0624
Let’s compare the test statistics of all 4 models:
data.frame(
Independence = coef(toe_indep) / (vcov(toe_indep) |> diag() |> sqrt()),
exchangeable = coef(toe_ex) / (vcov(toe_ex) |> diag() |> sqrt()),
AR1 = coef(toe_ar) / (vcov(toe_ar) |> diag() |> sqrt()),
Unstructured = coef(toe_un) / (vcov(toe_un) |> diag() |> sqrt())
)
## Independence exchangeable AR1 Unstructured
## (Intercept) -3.25188211 -3.39308188 -3.8137693 -4.427395
## treatmentT -0.00231878 0.02612795 0.2691934 0.148830
## month -5.83995826 -5.69828761 -5.0541191 -5.019508
## treatmentT:month -1.28985769 -1.43868806 -1.8585728 -1.863199
While none of the results drastically disagree with one another, which correlation structure should we use?
Like we do with other model selection, we can use an Information Criterion to help decide!
QIC() to compare which correlation structure is
preferredQuasi Information Criterion (QIC) is a recommended
method to compare models with the same mean structure (same formula
provided by geeglm()) but different
cor_str.
If we want to compare models with different mean structures but the same correlation structure, we can use QICu (which we aren’t, but in case you want to!)
QIC(toe_indep, toe_ex, toe_ar, toe_un) |>
data.frame()
## QIC QICu Quasi.Lik CIC params QICC
## toe_indep 1836.731 1824.015 -908.0075 10.357955 4 1836.869
## toe_ex 1838.471 1824.693 -908.3463 10.889287 4 1838.679
## toe_ar 1838.033 1826.013 -909.0066 10.009813 4 1838.241
## toe_un 1840.188 1829.012 -910.5062 9.587764 4 1845.039
Like other information criterion, we want to use ones that have a smaller value.
According to QIC, the independent model is the “best”, followed by the exchangable and arima models, then the unstructured.
Despite the results of QIC, we’ll move forward with the arima model since it is likely that the responses for each individual are associated and it is the typical model used for longitudinal data.
We’ll fit three different mean structures for the AR1 model:
Interaction, Additive, and Month only. We already have the interaction
model, toe_ar. Now we just need to fit the other 2
We can remove the interaction term using update() and
-treatment:month. The : in a formula indicates
just the interaction term. Otherwise using A*B will remove
A, B, and the interaction!
toe_add_ar1 <- update(toe_ar, formula = . ~ .- treatment:month)
toe_mon_ar1 <- update(toe_ar, formula = severe ~ month)
We can compare the results of the different models using
anova():
anova(toe_ar, toe_mon_ar1)
## Analysis of 'Wald statistic' Table
##
## Model 1 severe ~ treatment * month
## Model 2 severe ~ month
## Df X2 P(>|Chi|)
## 1 2 4.0938 0.1291
No strong evidence of an effect of treatment over time
#Seeing if we need to include treatment compared to the interaction model anova(toe_ar, toe_ARMon)
We can create a confidence interval for each parameter using the
esticon() function in the doBy package
esticon() takes a model, linear combination matrix, and
additionally can specify a vector of \(H_0\) values to test the combinations
But we don’t need beta0 to create a confidence interval for model effects
Can also specify joint.test = T if you’d like to test
all of the hypotheses using an omnibus (one combined) test. Use
F if you want to test individual combinations
# Getting the vector of coefficients for the ar model
gee_ar_coef <- coef(toe_ar)
# Getting the comparison matrix (a diagonal matrix where each row corresponds to 1 parameter)
diag_comp_mat <- diag(length(gee_ar_coef))
diag_comp_mat
## [,1] [,2] [,3] [,4]
## [1,] 1 0 0 0
## [2,] 0 1 0 0
## [3,] 0 0 1 0
## [4,] 0 0 0 1
esticon() gives the estimates and the standard errors of
the parameters if we use a diagonal comparison matrix, which is what we
created in diag_comp_mat
model_est <-
esticon(
obj = toe_ar,
L = diag_comp_mat,
beta0 = rep(0, length(gee_ar_coef)) # Vector of 0s for the null hypotheses
)
# Adding the names of the effects to the rows
row.names(model_est) <- names(gee_ar_coef)
round(model_est,4)
## estimate std.error statistic p.value beta0 df
## (Intercept) -0.6419 0.1683 14.5448 0.0001 0.0000 1
## treatmentT 0.0677 0.2515 0.0725 0.7878 0.0000 1
## month -0.1383 0.0274 25.5441 0.0000 0.0000 1
## treatmentT:month -0.0957 0.0515 3.4543 0.0631 0.0000 1
If you want a confidence interval, we can extract the upper and lower
bounds of the confidence intervals using obj$lwr and
obj$upr
model_odds_CI <-
data.frame(
Estimate = model_est$estimate,
Lower = model_est$lwr,
Upper = model_est$upr
) |>
# Exponentiating to calculate the odds ratio
exp()
row.names(model_odds_CI) <- row.names(model_est)
round(model_odds_CI,4)
## Estimate Lower Upper
## (Intercept) 0.5263 0.3784 0.7320
## treatmentT 1.0700 0.6537 1.7516
## month 0.8708 0.8253 0.9188
## treatmentT:month 0.9088 0.8216 1.0052
We can also look at combinations of factors as well. Let’s check 3 Combinations using the Interaction Model:
The odds ratio of Moderate or Severe Separation (MSS) for the two treatments after 6 months
The odds a person taking itraconzaole has moderate or severe separation after 6 months (intercept + slope*6)
The odds ratio that someone taking terbinafine has MSS a month later (slope + interaction)
# Combination matrix (cm):
toe_comp_mat <-
rbind(
c(0,1,0,6), # Odds ratio of MSS after 6 Months for Terb vs Itra
c(0,0,6,0), # Odds ratio of itraconzaole user has MSS after 6 months
c(0,0,1,1) # Odds ratio of terbinafine user has MSS after an additional month
)
toe_comp_mat
## [,1] [,2] [,3] [,4]
## [1,] 0 1 0 6
## [2,] 0 0 6 0
## [3,] 0 0 1 1
# CI for the additive model
toe_est <- esticon(toe_ar, L = toe_comp_mat)
row.names(toe_est) <- c("OR Trt: 6 Months", "OR Itra: 6 Months", "OR Terb: Month")
round(toe_est,4) |> data.frame()
## estimate std.error statistic p.value beta0 df lwr upr
## OR Trt: 6 Months -0.5063 0.2807 3.2539 0.0713 0 1 -1.0565 0.0438
## OR Itra: 6 Months -0.8299 0.1642 25.5441 0.0000 0 1 -1.1518 -0.5081
## OR Terb: Month -0.2340 0.0436 28.8076 0.0000 0 1 -0.3194 -0.1485
# Can get the upper and lower estimates using \$lwr and $upr
toe_odds_CI <-
data.frame(
Estimate = toe_est$estimate,
Lower = toe_est$lwr,
Upper = toe_est$upr
) |>
exp()
row.names(toe_odds_CI) <- row.names(toe_est)
round(toe_odds_CI,4)
## Estimate Lower Upper
## OR Trt: 6 Months 0.6027 0.3477 1.0448
## OR Itra: 6 Months 0.4361 0.3161 0.6016
## OR Terb: Month 0.7914 0.7266 0.8620
trt_label <- c("Itraconzaole", "Terbinafine")
names(trt_label) <- c("I","T")
# Scatterplot for success pct by visit and treatment, ignoring subject
data.frame(
toe,
gee_MSS = predict(toe_ar, type = "response"),
logit_MSS = predict(toe_indep, type = "response")
) |>
mutate(
.by = visit0,
avg_month = mean(month)
) |>
summarize(
.by = c(avg_month, treatment),
n_obs = n(),
prop_0 = sum(severe == 1)/n_obs,
pred_gee = mean(gee_MSS),
pred_logit = mean(logit_MSS)
) |>
mutate(
treatment = if_else(treatment == "I", "Itraconzaole", "Terbinafine")
) |>
ggplot(
mapping = aes(
x = avg_month,
y = prop_0
)
) +
facet_wrap(
facets = vars(treatment),
nrow = 2
) +
# Observed Proportions
geom_line(
mapping = aes(color="Observed")
) +
geom_point(
mapping = aes(color="Observed")
) +
# GEE Proportions with Unstructured Correlations
geom_line(
mapping = aes(
y = pred_gee,
color = "GEE"
),
linetype="dashed"
) +
geom_point(
mapping = aes(
y = pred_gee,
color = "GEE"
),
shape = 15
) +
# Logistic Proportions
geom_line(
mapping = aes(
y = pred_logit,
color = "Logistic"
),
linetype="longdash"
) +
geom_point(
mapping = aes(
y = pred_logit,
color = "Logistic"
),
shape = 17
) +
# Average month per visit on the x-axis
scale_x_continuous(
breaks = toe |>
summarize(.by = visit0, avg_month = mean(month)) |>
pull(avg_month) |>
round(),
minor_breaks = NULL
) +
labs(
x = "Months after Initival Visit",
y = "Proportion with Moderate or Severe Separation",
title = "Proportion of Patients with Moderate or Severe Separation",
subtitle = "By Month and Treatment"
) +
theme_bw() +
text_theme +
theme(legend.position="bottom") +
scale_colour_manual(
name = 'Probability Type',
values = c('Observed'='#F8766D',
'GEE'='#00BFC4',
"Logistic"="#C77CFF")
)