SET WORKING DIRECTORY AND LOAD LIBRARIES
setwd("D:/ACADEMIA/Research/Lucy")
rm(list=ls())
library(readr)
library(tidyverse)
data_wide <- read_csv("data.csv")
data_long <- read_csv("data_long.csv")
LIBRARIES AND THEIR APPLICATIONS
library(labelled) # labeling data
library(rstatix) # summary statistics
library(ggpubr) # convenient summary statistics and plots
library(GGally) # advanced plot
library(car) # useful for anova/wald test
library(Epi) # easy getting CI for model coef/pred
library(lme4) # linear mixed-effects models
library(lmerTest) # test for linear mixed-effects models
library(emmeans) # marginal means
library(multcomp) # CI for linear combinations of model coef
library(geepack) # generalized estimating equations
library(ggeffects) # marginal effects, adjusted predictions
library(gt) # nice tables
library(tidyverse) # for everything (data manipulation, visualization, coding, and more)
theme_set(theme_minimal() + theme(legend.position = "bottom")) # theme for ggplot
library(haven)
library(viridis)
library(hrbrthemes)
library(scales)
READ COLUMN NAMES
colnames(data_long)
## [1] "ID" "Day"
## [3] "StudyID" "site"
## [5] "weekstreatmentadherence" "sex"
## [7] "HIVStatus" "Semiqtltyrslt"
## [9] "Smoking" "diabeticstatus"
## [11] "BCGscar" "Ageyrs"
## [13] "HeightCMs" "Weightkgs"
## [15] "BMI" "agecat"
## [17] "BMIcat" "cough_Day"
df <- data_long
RECODE DATA STRUCTURE OR DATA TYPES
df$Day <- factor(df$Day, levels = 1:14, labels = c("Day 1", "Day 2", "Day 3", "Day 4", "Day 5", "Day 6", "Day 7", "Day 8", "Day 9", "Day 10", "Day 11", "Day 12", "Day 13", "Day 14"))
df$site <- as.factor(df$site)
df$HIVStatus <- as.factor(df$HIVStatus)
df$sex <- as.factor(df$sex)
df$diabeticstatus <- as.factor(df$diabeticstatus)
df$Smoking <- as.factor(df$Smoking)
df$BCGscar <- as.factor(df$BCGscar)
df$Semiqtltyrslt <- as.factor(df$Semiqtltyrslt)
df$ID <-as.numeric(df$ID)
df$Ageyrs <-as.numeric(df$Ageyrs)
df$BMI <-as.numeric(df$BMI)
SUMMARIZE OUTCOME BY TIME PERIODS
group_by(df, Day) %>%
get_summary_stats(cough_Day)
## # A tibble: 14 × 14
## Day variable n min max median q1 q3 iqr mad mean sd
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Day 1 cough_Day 100 3 702 100 38 212. 174. 111. 149. 149.
## 2 Day 2 cough_Day 100 0 1438 154. 50 385 335 201. 244. 248.
## 3 Day 3 cough_Day 100 0 1212 152 29.2 389 360. 208. 242. 261.
## 4 Day 4 cough_Day 100 0 1696 112. 20 297. 277. 167. 203. 258.
## 5 Day 5 cough_Day 100 0 2351 70 6.75 186. 180. 104. 179. 326.
## 6 Day 6 cough_Day 100 0 1268 80 2 190. 188. 119. 156. 235.
## 7 Day 7 cough_Day 100 0 1181 64 1.75 200 198. 94.9 149. 221.
## 8 Day 8 cough_Day 100 0 1333 56.5 0 234. 234. 83.8 174. 273.
## 9 Day 9 cough_Day 100 0 1318 39 0 200 200 57.8 153. 239.
## 10 Day 10 cough_Day 100 0 1999 52.5 0 183 183 77.8 160. 276.
## 11 Day 11 cough_Day 100 0 895 50 5 190 185 74.1 148. 207.
## 12 Day 12 cough_Day 100 0 1045 40.5 1.5 215 214. 60.0 167. 243.
## 13 Day 13 cough_Day 100 0 1003 40 1.5 155. 153. 59.3 139. 214.
## 14 Day 14 cough_Day 100 0 1091 30 0.75 115 114. 44.5 106. 192.
## # ℹ 2 more variables: se <dbl>, ci <dbl>
LOOK AT A PAIRWISE CORRELATION MATRIX
#transform the data from long to wide format
wide_data <- df %>%
pivot_wider(names_from = Day, values_from = cough_Day)
head(wide_data)
## # A tibble: 6 × 30
## ID StudyID site weekstreatmentadherence sex HIVStatus Semiqtltyrslt
## <dbl> <chr> <fct> <chr> <fct> <fct> <fct>
## 1 1 R2D204007 Mulago Yes Male Negative High
## 2 2 R2D204004 Mulago Yes Male Negative High
## 3 3 R2D204008 Mulago Yes Male Negative Low
## 4 4 R2D204005 Mulago Yes Male Negative High
## 5 5 R2D204012 Mulago Yes Female Negative High
## 6 6 R2D204010 Mulago Yes Male Negative Medium
## # ℹ 23 more variables: Smoking <fct>, diabeticstatus <fct>, BCGscar <fct>,
## # Ageyrs <dbl>, HeightCMs <dbl>, Weightkgs <dbl>, BMI <dbl>, agecat <chr>,
## # BMIcat <chr>, `Day 1` <dbl>, `Day 2` <dbl>, `Day 3` <dbl>, `Day 4` <dbl>,
## # `Day 5` <dbl>, `Day 6` <dbl>, `Day 7` <dbl>, `Day 8` <dbl>, `Day 9` <dbl>,
## # `Day 10` <dbl>, `Day 11` <dbl>, `Day 12` <dbl>, `Day 13` <dbl>,
## # `Day 14` <dbl>
#plot of correlation matrix from Day 1 to Day 14 columns
correlation_matrix <- cor(wide_data[, 17:30])
correlation_matrix <- round(correlation_matrix, 2)
correlation_matrix
## Day 1 Day 2 Day 3 Day 4 Day 5 Day 6 Day 7 Day 8 Day 9 Day 10 Day 11
## Day 1 1.00 0.75 0.79 0.76 0.65 0.71 0.77 0.71 0.66 0.50 0.65
## Day 2 0.75 1.00 0.89 0.75 0.77 0.74 0.80 0.72 0.61 0.43 0.60
## Day 3 0.79 0.89 1.00 0.82 0.74 0.72 0.76 0.70 0.66 0.47 0.65
## Day 4 0.76 0.75 0.82 1.00 0.79 0.79 0.82 0.70 0.67 0.45 0.64
## Day 5 0.65 0.77 0.74 0.79 1.00 0.88 0.81 0.70 0.59 0.35 0.57
## Day 6 0.71 0.74 0.72 0.79 0.88 1.00 0.86 0.72 0.63 0.43 0.63
## Day 7 0.77 0.80 0.76 0.82 0.81 0.86 1.00 0.82 0.69 0.51 0.74
## Day 8 0.71 0.72 0.70 0.70 0.70 0.72 0.82 1.00 0.88 0.77 0.83
## Day 9 0.66 0.61 0.66 0.67 0.59 0.63 0.69 0.88 1.00 0.84 0.81
## Day 10 0.50 0.43 0.47 0.45 0.35 0.43 0.51 0.77 0.84 1.00 0.85
## Day 11 0.65 0.60 0.65 0.64 0.57 0.63 0.74 0.83 0.81 0.85 1.00
## Day 12 0.70 0.65 0.69 0.63 0.61 0.68 0.74 0.83 0.79 0.79 0.89
## Day 13 0.71 0.61 0.65 0.64 0.62 0.66 0.70 0.82 0.89 0.80 0.83
## Day 14 0.62 0.53 0.59 0.49 0.47 0.52 0.67 0.67 0.68 0.62 0.74
## Day 12 Day 13 Day 14
## Day 1 0.70 0.71 0.62
## Day 2 0.65 0.61 0.53
## Day 3 0.69 0.65 0.59
## Day 4 0.63 0.64 0.49
## Day 5 0.61 0.62 0.47
## Day 6 0.68 0.66 0.52
## Day 7 0.74 0.70 0.67
## Day 8 0.83 0.82 0.67
## Day 9 0.79 0.89 0.68
## Day 10 0.79 0.80 0.62
## Day 11 0.89 0.83 0.74
## Day 12 1.00 0.88 0.83
## Day 13 0.88 1.00 0.77
## Day 14 0.83 0.77 1.00
VISUALIZE THE CORRELATION MATRIX USING A HEATMAP
#plot of correlation matrix heat map from Day 1 to Day 14 columns
library(reshape2) # for melt function
correlation_matrix_melted <- melt(correlation_matrix)
p <- ggplot(correlation_matrix_melted, aes(Var1, Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_viridis() +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "right") +
labs(title = "Correlation Matrix Heatmap",
x = "Day",
y = "Day")
print(p)
VISUALIZE THE CORRELATION MATRIX USING A PAIRWISE SCATTERPLOT
ggpairs(wide_data, mapping = aes(), columns = 17:20,
lower = list(continuous = "smooth"))
CORRELATION BY SITE
ggpairs(wide_data, mapping = aes(color = site), columns = 17:20,
lower = list(continuous = "smooth"))
SUMMARY STATISTICS
#summarize cough_Day by mean, standard deviation, median, and IQR
df_summary <- df %>%
summarise(mean = mean(cough_Day),
sd = sd(cough_Day),
median = median(cough_Day),
q1 = quantile(cough_Day, 0.25),
q3 = quantile(cough_Day, 0.75))
print(df_summary)
## # A tibble: 1 × 5
## mean sd median q1 q3
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 169. 244. 70 6 240
PLOT INDIVIDUAL PROFILES OF COUGHS USING A SPAGHETTI PLOT
# Create the spaghetti plot
p <- ggplot(df, aes(x = Day, y = cough_Day, group = ID, color = as.factor(ID))) +
geom_line() +
geom_point() +
theme_minimal() +
labs(title = "Individual profiles of coughs",
x = "Time (Days)",
y = "Coughs") +
theme_update(plot.title = element_text(hjust = 0.5))+
theme(legend.position = "none")
print(p)
PLOT INDIVIDUAL PROFILES OF COUGHS BY STUDY SITE
#create a spaghetti plot by site
p <- ggplot(df, aes(x = Day, y = cough_Day, group = ID, color = site)) +
geom_line() +
geom_point() +
theme_minimal() +
labs(title = "Individual profiles of coughs by site",
x = "Time (Days)",
y = "Coughs") +
theme_update(plot.title = element_text(hjust = 0.5))+
theme(legend.position = "bottom")
print(p)
VISUALIZE THE DISTRIBUTION OF COUGHS USING A HISTOGRAM
#HISTOGRAM - cough_Day
#Visualize the distribution of cough_Day (whether there is a violation of the multivariate normality assumption);
#It is difficult to visualize normality of a vector (multiple time points = multiple axes)
#We can look at the overall distribution (as if there is only one time point)
#if we see deviations from Normality here, there are likely deviations at the multivariate level;
#Note: number of 30 bins are selected automatically in this code
p2 <- ggplot(data=df, aes(x=cough_Day))
p2 + geom_histogram(color="black", fill="lightblue", bins=20) +
ggtitle("Distribution of Coughs")
#Add a kernel density line using geom_density and a Normal curve using stat_function
#Note: histogram binwidth is calculated directly from density function
#suggested bimodal distribution?
p2 + geom_histogram(aes(y=..density..), color="black", fill="lightblue", binwidth=density(df$cough_Day)$bw) +
geom_density( linetype="dashed") +
ggtitle("Distribution of coughs")+
stat_function(fun = dnorm, args = list(mean=mean(df$cough_Day), sd=sd(df$cough_Day)))
#OVERLAYED TRANSPARENT HISTOGRAM of cough_Day by site
#If the bimodal pattern persists by group, this is more likely to be a data artificial (ex. missing people in study), rather than true non-normality
p2 <- ggplot(data=df, aes(x=cough_Day, fill=site))
p2 + geom_histogram(color="black", alpha=0.5, position="identity", bins = 20) +
labs(title="Distribution of coughs by study site", x="coughs", fill="site")
p3 <- ggplot(data=df, aes(x=cough_Day))
p3 + geom_histogram(aes(y = stat(density), color=site, fill=site), alpha=0.5, position="identity", bins = 20) +
geom_density(aes(color = site), linetype="dashed") +
labs(title="Distribution of coughs by study site",
x="Coughs", color="site", fill="site")
VISUALIZE THE DISTRIBUTION OF COUGHS BY STUDY SITE USING A BOX PLOT
#BOX PLOT - cough_Day by site
p4 <- ggplot(data=df, aes(y=cough_Day, x=site))
p4 + geom_boxplot() +
labs(title="Distribution of coughs by site",
x="site", y="coughs")+stat_summary(fun.y=mean, geom="point", shape=20, size=3, color="red", fill="red")
#R defines outliers that are more extreme than +/- 1.5(IQR)
#Note: the identification of outliers is SUBJECTIVE!!! Do not simply discard data outside the fence - know your subject matter.
#Check whether the potential outliers are actually biologically plausible or whether they are data entry errors!
VISUALIZE THE DISTRIBUTION OF COUGHS BY TIME PERIOD USING A BOX PLOT
#BOX PLOT - cough_Day by Day
p4 <- ggplot(data=df, aes(y=cough_Day, x=Day))
p4 + geom_boxplot() +
labs(title="Distribution of coughs by Time",
x="Time (Days)", y="coughs")+stat_summary(fun.y=mean, geom="point", shape=20, size=3, color="red", fill="red")
#boxplot of cough_Day by Day and site
p <- ggplot(df, aes(x = Day, y = cough_Day, fill = site)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Coughs by Time and Site",
x = "Time (Days)",
y = "Coughs") +
theme(plot.title = element_text(hjust = 0.5))+stat_summary(fun.y=mean, geom="point", shape=20, size=3, color="blue", fill="blue")
print(p)
VISUALIZE THE MEAN COUGHS BY TIME PERIOD USING A LINE PLOT
# Calculate the mean and standard error of cough_Day by Day
df_summary <- df %>%
group_by(Day) %>%
summarise(mean = mean(cough_Day),
se = sd(cough_Day) / sqrt(n()))
# Create the line plot
p <- ggplot(df_summary, aes(x = Day, y = mean, group = 1, color = Day)) +
geom_line() +
geom_point() +
geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.1) +
theme_minimal() +
labs(title = "Mean Coughs by Time",
x = "Time (Days)",
y = "Coughs") +
theme(plot.title = element_text(hjust = 0.5))+
theme(legend.position = "none")
print(p)
VISUALIZE THE MEAN COUGHS BY TIME PERIOD AND STUDY SITE USING A LINE PLOT
# Calculate the mean and standard error of cough_Day by Day and site
df_summary <- df %>%
group_by(Day, site) %>%
summarise(mean = mean(cough_Day),
se = sd(cough_Day) / sqrt(n()),
.groups = 'drop') # Ensure summarise doesn't keep grouping
# Create the line plot separated by site and colored by site
p <- ggplot(df_summary, aes(x = Day, y = mean, group = site, color = site)) +
geom_line() +
geom_point() +
geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.1) +
theme_minimal() +
labs(title = "Mean Coughs by Time and Site",
x = "Time (Days)",
y = "Coughs") +
theme(plot.title = element_text(hjust = 0.5))
print(p)
VISUALIZE THE MEDIAN COUGHS BY TIME PERIOD USING A LINE PLOT
# Calculate the median and IQR of cough_Day by Day
df_summary <- df %>%
group_by(Day) %>%
summarise(median = median(cough_Day),
q1 = quantile(cough_Day, 0.25),
q3 = quantile(cough_Day, 0.75))
# Create the line plot
p <- ggplot(df_summary, aes(x = Day, y = median, group = 1, color = Day)) +
geom_line() +
geom_point() +
geom_errorbar(aes(ymin = q1, ymax = q3), width = 0.1) +
theme_minimal() +
labs(title = "Median Coughs by Time",
x = "Time (Days)",
y = "Coughs") +
theme(plot.title = element_text(hjust = 0.5))+
theme(legend.position = "none")
print(p)
VISUALIZE THE MEDIAN COUGHS BY TIME PERIOD AND STUDY SITE USING A LINE PLOT
# Calculate the median and IQR of cough_Day by Day and site
df_summary <- df %>%
group_by(Day, site) %>%
summarise(median = median(cough_Day),
q1 = quantile(cough_Day, 0.25),
q3 = quantile(cough_Day, 0.75),
.groups = 'drop') # Ensure summarise doesn't keep grouping
# Create the line plot separated by site and colored by site
p <- ggplot(df_summary, aes(x = Day, y = median, group = site, color = site)) +
geom_line() +
geom_point() +
geom_errorbar(aes(ymin = q1, ymax = q3), width = 0.1) +
theme_minimal() +
labs(title = "Median Coughs by Time and Site",
x = "Time (Days)",
y = "Coughs") +
theme(plot.title = element_text(hjust = 0.5))
print(p)
VISUALIZE THE COUGHS BY AGE AND STUDY SITE USING A SCATTER PLOT
#scatter plot of cough_Day by Ageyrs and site
p <- ggplot(df, aes(x = Ageyrs, y = cough_Day, color = site)) +
geom_point() +
theme_minimal() +
labs(title = "Coughs by Age and Site",
x = "Age (Years)",
y = "Coughs") +
theme(plot.title = element_text(hjust = 0.5))
print(p)
Lets try to transform the OUTCOME using natural log to see if we can get a better model fit. We shall include only finite values in the transformation.
#create a variable of log transformed cough_Day round to 2 decimal places include only finite values
df$log_cough_Day <- round(log(df$cough_Day), 2)
df3 <- df
df3 <- df3[is.finite(df$log_cough_Day), ]
#plot the histogram of log transformed cough_Day
p <- ggplot(data=df3, aes(x=log_cough_Day))
p + geom_histogram(color="black", fill="lightblue", bins=15) +
ggtitle("Distribution of log transformed coughs")
#Add a kernel density line using geom_density and a Normal curve using stat_function
p + geom_histogram(aes(y=..density..), color="black", fill="lightblue", binwidth=density(df3$log_cough_Day)$bw) +
geom_density( linetype="dashed") +
ggtitle("Distribution of log transformed coughs")+
stat_function(fun = dnorm, args = list(mean=mean(df3$log_cough_Day), sd=sd(df3$log_cough_Day)))
#summarize the log transformed cough_Day by mean, standard deviation, median, and IQR
df_summary <- df3 %>%
summarise(mean = mean(log_cough_Day),
sd = sd(log_cough_Day),
median = median(log_cough_Day),
q1 = quantile(log_cough_Day, 0.25),
q3 = quantile(log_cough_Day, 0.75))
print(df_summary)
## # A tibble: 1 × 5
## mean sd median q1 q3
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4.48 1.56 4.7 3.53 5.7
#spaghetti plot of log transformed cough_Day
p <- ggplot(df3, aes(x = Day, y = log_cough_Day, group = ID, color = as.factor(ID))) +
geom_line() +
geom_point() +
theme_minimal() +
labs(title = "Individual profiles of log transformed coughs",
x = "Time (Days)",
y = "Log Coughs") +
theme(plot.title = element_text(hjust = 0.5))+
theme(legend.position = "none")
print(p)
#Average trendline of log transformed cough_Day by Day using a loess
df4 <- df3
df4$Day <- as.numeric(df4$Day)
p <- ggplot(df4, aes(x = Day, y = log_cough_Day)) +
geom_point() +
geom_smooth(method = "loess", se = FALSE, formula = y~x) +
theme_minimal() +
labs(title = "Average trendline of log transformed coughs by Time",
x = "Time (Days)",
y = "Log Coughs") +
theme(plot.title = element_text(hjust = 0.5))
print(p)
#overlayed transparent histogram of log transformed cough_Day by site
p2 <- ggplot(data=df3, aes(x=log_cough_Day, fill=site))
p2 + geom_histogram(color="black", alpha=0.5, position="identity", bins = 15) +
labs(title="Distribution of log transformed coughs by study site", x="log coughs", fill="site")
p3 <- ggplot(data=df3, aes(x=log_cough_Day))
p3 + geom_histogram(aes(y = stat(density), color=site, fill=site), alpha=0.5, position="identity", bins = 15) +
geom_density(aes(color = site), linetype="dashed") +
labs(title="Distribution of log transformed coughs by study site",
x="Log Coughs", color="site", fill="site")
#boxplot of log transformed cough_Day by Day and site
p <- ggplot(df3, aes(x = Day, y = log_cough_Day, fill = site)) +
geom_boxplot() +
theme_minimal() +
labs(title = "Log Coughs by Time and Site",
x = "Time (Days)",
y = "Log Coughs") +
theme(plot.title = element_text(hjust = 0.5))
print(p)
# Calculate the mean and standard error of log_cough_Day by Day
df_summary <- df3 %>%
group_by(Day) %>%
summarise(mean = mean(log_cough_Day),
se = sd(log_cough_Day) / sqrt(n()))
# Create the line plot
p <- ggplot(df_summary, aes(x = Day, y = mean, group = 1, color = Day)) +
geom_line() +
geom_point() +
geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.1) +
theme_minimal() +
labs(title = "Mean Log Coughs by Time",
x = "Time (Days)",
y = "Log Coughs") +
theme(plot.title = element_text(hjust = 0.5))+
theme(legend.position = "none")
print(p)
# Calculate the mean and standard error of log_cough_Day by Day and site
df_summary <- df3 %>%
group_by(Day, site) %>%
summarise(mean = mean(log_cough_Day),
se = sd(log_cough_Day) / sqrt(n()),
.groups = 'drop') # Ensure summarise doesn't keep grouping
# Create the line plot separated by site and colored by site
p <- ggplot(df_summary, aes(x = Day, y = mean, group = site, color = site)) +
geom_line() +
geom_point() +
geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.1) +
theme_minimal() +
labs(title = "Mean Log Coughs by Time and Site",
x = "Time (Days)",
y = "Log Coughs") +
theme(plot.title = element_text(hjust = 0.5))
print(p)
# Calculate the median and IQR of log_cough_Day by Day
df_summary <- df3 %>%
group_by(Day) %>%
summarise(median = median(log_cough_Day),
q1 = quantile(log_cough_Day, 0.25),
q3 = quantile(log_cough_Day, 0.75))
# Create the line plot
p <- ggplot(df_summary, aes(x = Day, y = median, group = 1, color = Day)) +
geom_line() +
geom_point() +
geom_errorbar(aes(ymin = q1, ymax = q3), width = 0.1) +
theme_minimal() +
labs(title = "Median Log Coughs by Time",
x = "Time (Days)",
y = "Log Coughs") +
theme(plot.title = element_text(hjust = 0.5))+
theme(legend.position = "none")
print(p)
# Calculate the median and IQR of log_cough_Day by Day and site
df_summary <- df3 %>%
group_by(Day, site) %>%
summarise(median = median(log_cough_Day),
q1 = quantile(log_cough_Day, 0.25),
q3 = quantile(log_cough_Day, 0.75),
.groups = 'drop') # Ensure summarise doesn't keep grouping
# Create the line plot separated by site and colored by site
p <- ggplot(df_summary, aes(x = Day, y = median, group = site, color = site)) +
geom_line() +
geom_point() +
geom_errorbar(aes(ymin = q1, ymax = q3), width = 0.1) +
theme_minimal() +
labs(title = "Median Log Coughs by Time and Site",
x = "Time (Days)",
y = "Log Coughs") +
theme(plot.title = element_text(hjust = 0.5))
print(p)
#scatter plot of log_cough_Day by Ageyrs and site
p <- ggplot(df3, aes(x = Ageyrs, y = log_cough_Day, color = site)) +
geom_point( aes(shape = site)) +
theme_minimal() +
labs(title = "Log Coughs by Age and Site",
x = "Age (Years)",
y = "Log Coughs") +
theme(plot.title = element_text(hjust = 0.5))
print(p)
Reminding ourselves about the necessary libraries
library(lme4) # for linear mixed-effects models
library(lmerTest) # for p-values of linear mixed-effects models
library(glmmTMB) # for Poisson mixed-effects models
REGRESSION ANALYSIS
#convert Day to numeric
df$Day <- as.numeric(df$Day)
# Fit the GLM with Poisson distribution and log link, Day as continuous
poisson_model_cont <- glm(cough_Day ~ Day + site + ID,
family = poisson(link = "log"),
data = df)
# Check for overdispersion
dispersion <- sum(residuals(poisson_model_cont, type = "pearson")^2) / df.residual(poisson_model_cont)
print(paste("Dispersion parameter:", dispersion))
## [1] "Dispersion parameter: 287.10163875548"
# Summary of the model
summary_cont <- summary(poisson_model_cont)
print(summary_cont)
##
## Call:
## glm(formula = cough_Day ~ Day + site + ID, family = poisson(link = "log"),
## data = df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.074e+00 7.150e-03 709.64 <2e-16 ***
## Day -3.712e-02 5.134e-04 -72.31 <2e-16 ***
## siteMulago -3.578e-01 4.859e-03 -73.65 <2e-16 ***
## ID 8.573e-03 8.155e-05 105.12 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 380005 on 1399 degrees of freedom
## Residual deviance: 343954 on 1396 degrees of freedom
## AIC: 351119
##
## Number of Fisher Scoring iterations: 6
Plot the residuals for the continuous Day model
# Extract residuals
residuals_poisson_cont <- residuals(poisson_model_cont, type = "deviance")
# Plot residuals for continuous Day model
ggplot(data = df, aes(x = seq_along(residuals_poisson_cont), y = residuals_poisson_cont)) +
geom_point() +
geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
labs(title = "Residuals (GLM Poisson - Day as Continuous)",
x = "Index",
y = "Deviance Residuals") +
theme_minimal()
Lets fit a GLM negative binomial mixed effects model
# Install and load necessary library
#install.packages("MASS")
library(MASS)
#convert Day to numeric
df$Day <- as.numeric(df$Day)
# Fit the Negative Binomial model
nb_model_cont <- glm.nb(cough_Day ~ Day + site + ID, data = df)
# Summary of the model
summary(nb_model_cont)
##
## Call:
## glm.nb(formula = cough_Day ~ Day + site + ID, data = df, init.theta = 0.3320238524,
## link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.139069 0.156147 32.912 < 2e-16 ***
## Day -0.048277 0.011519 -4.191 2.78e-05 ***
## siteMulago -0.324079 0.103386 -3.135 0.00172 **
## ID 0.008581 0.001790 4.793 1.64e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(0.332) family taken to be 1)
##
## Null deviance: 1773.6 on 1399 degrees of freedom
## Residual deviance: 1700.7 on 1396 degrees of freedom
## AIC: 15685
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 0.3320
## Std. Err.: 0.0118
##
## 2 x log-likelihood: -15675.1320
# Check dispersion
dispersion_nb <- sum(residuals(nb_model_cont, type = "pearson")^2) / df.residual(nb_model_cont)
print(paste("Dispersion parameter (Negative Binomial):", dispersion_nb))
## [1] "Dispersion parameter (Negative Binomial): 0.569725366742915"
plot the residuals for the negative binomial model
# Extract residuals
residuals_nb_cont <- residuals(nb_model_cont, type = "deviance")
# Plot residuals
ggplot(data = df, aes(x = seq_along(residuals_nb_cont), y = residuals_nb_cont)) +
geom_point() +
geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
labs(title = "Residuals (Negative Binomial - Day as Continuous)",
x = "Index",
y = "Deviance Residuals") +
theme_minimal()
Fixed Effects: Day: The effect of Day on cough_Day is consistent across all sites and individuals. Random Effects: (1 | site): The intercept can vary across sites. Each site has its own baseline level of cough_Day. (1 | ID): The intercept can vary across individuals. Each individual has their own baseline level of cough_Day.
This model allows for variability in both the baseline count of coughs (intercept) and the effect of Day (slope) across sites, but only variability in the intercept across individuals. Suitable if you believe both the baseline cough count and the rate of change (slope) with Day vary by site.
#lets convert Day to factor
df$Day <- as.factor(df$Day)
# Fit the Random-Intercept Poisson mixed-effects model
random_intercept_model <- glmer(cough_Day ~ Day + (1 | site) + (1 | ID), family = poisson(link = "log"), data = df)
## boundary (singular) fit: see help('isSingular')
# Summary of the model
summary(random_intercept_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: cough_Day ~ Day + (1 | site) + (1 | ID)
## Data: df
##
## AIC BIC logLik deviance df.resid
## 94487.6 94571.5 -47227.8 94455.6 1384
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -26.145 -4.197 -1.302 2.886 57.346
##
## Random effects:
## Groups Name Variance Std.Dev.
## ID (Intercept) 2.852 1.689
## site (Intercept) 0.000 0.000
## Number of obs: 1400, groups: ID, 100; site, 2
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.065127 0.169220 24.023 < 2e-16 ***
## Day2 0.493119 0.010406 47.387 < 2e-16 ***
## Day3 0.485492 0.010421 46.586 < 2e-16 ***
## Day4 0.310568 0.010794 28.772 < 2e-16 ***
## Day5 0.184326 0.011097 16.610 < 2e-16 ***
## Day6 0.044712 0.011468 3.899 9.67e-05 ***
## Day7 0.002954 0.011587 0.255 0.7987
## Day8 0.158780 0.011162 14.225 < 2e-16 ***
## Day9 0.025757 0.011522 2.235 0.0254 *
## Day10 0.070485 0.011397 6.185 6.22e-10 ***
## Day11 -0.007288 0.011617 -0.627 0.5304
## Day12 0.116774 0.011272 10.360 < 2e-16 ***
## Day13 -0.066230 0.011793 -5.616 1.95e-08 ***
## Day14 -0.340259 0.012717 -26.757 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation matrix not shown by default, as p = 14 > 12.
## Use print(x, correlation=TRUE) or
## vcov(x) if you need it
## optimizer (Nelder_Mead) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
extract the fixed effect coefficients
# Extract the fixed effect coefficients
fixed_effects <- fixef(random_intercept_model)
# Exponentiate the fixed effect coefficients
exp_fixed_effects <- exp(fixed_effects)
# Print the exponentiated coefficients
exp_fixed_effects
## (Intercept) Day2 Day3 Day4 Day5 Day6
## 58.2723048 1.6374161 1.6249749 1.3641995 1.2024080 1.0457267
## Day7 Day8 Day9 Day10 Day11 Day12
## 1.0029588 1.1720797 1.0260912 1.0730280 0.9927381 1.1238657
## Day13 Day14
## 0.9359157 0.7115861
# Plot residuals for the glmer model
plot(residuals(random_intercept_model, type = "pearson"), main = "Residuals (randon intercept)")
#add a smooth line to the plot
lines(lowess(residuals(random_intercept_model, type = "pearson")), col = "red")
Fixed Effects:
Day: The overall effect of Day on cough_Day. Random Effects:
(Day | site): Both the intercept and the slope of Day can vary across sites. Each site has its own baseline level of cough_Day and its own effect of Day. (1 | ID): The intercept can vary across individuals. Each individual has their own baseline level of cough_Day.
This model allows for variability in the baseline count of coughs (intercept) across sites and variability in both the baseline count of coughs (intercept) and the effect of Day (slope) across individuals.
# Fit the Random-Coefficient Poisson mixed-effects model
random_coefficient_model <- glmer(cough_Day ~ Day + (Day | site) + (1| ID), family = poisson(link = "log"), data = df)
# Summary of the model
summary(random_coefficient_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: cough_Day ~ Day + (Day | site) + (1 | ID)
## Data: df
##
## AIC BIC logLik deviance df.resid
## 92816.4 93445.7 -46288.2 92576.4 1280
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -27.628 -3.982 -1.276 2.569 53.009
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## ID (Intercept) 2.80583 1.6751
## site (Intercept) 0.03509 0.1873
## Day2 0.04096 0.2024 0.82
## Day3 0.02790 0.1670 0.81 1.00
## Day4 0.03307 0.1818 0.39 0.63 0.65
## Day5 0.41042 0.6406 0.45 0.64 0.65 0.83
## Day6 0.21812 0.4670 0.36 0.56 0.57 0.87 0.99
## Day7 0.30276 0.5502 0.55 0.78 0.79 0.90 0.78 0.79
## Day8 0.49362 0.7026 0.33 0.49 0.50 0.71 0.84 0.84 0.82
## Day9 0.42300 0.6504 0.35 0.49 0.50 0.65 0.79 0.78 0.81 0.99
## Day10 0.65959 0.8122 0.39 0.54 0.55 0.67 0.78 0.77 0.85 0.99
## Day11 0.99829 0.9991 0.27 0.37 0.38 0.49 0.63 0.62 0.57 0.75
## Day12 0.65517 0.8094 0.27 0.40 0.41 0.56 0.65 0.65 0.67 0.83
## Day13 0.93183 0.9653 0.31 0.42 0.42 0.53 0.69 0.67 0.61 0.80
## Day14 0.92247 0.9605 0.32 0.50 0.51 0.74 0.74 0.76 0.80 0.86
##
##
##
##
##
##
##
##
##
##
##
## 1.00
## 0.76 0.75
## 0.83 0.83 0.99
## 0.80 0.79 1.00 0.99
## 0.84 0.85 0.93 0.97 0.94
## Number of obs: 1400, groups: ID, 100; site, 2
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.072425 0.213842 19.044 < 2e-16 ***
## Day2 0.471765 0.143499 3.288 0.00101 **
## Day3 0.469919 0.118577 3.963 7.4e-05 ***
## Day4 0.318058 0.129041 2.465 0.01371 *
## Day5 0.145743 0.453146 0.322 0.74774
## Day6 0.036622 0.330448 0.111 0.91175
## Day7 -0.008039 0.389252 -0.021 0.98352
## Day8 0.109320 0.496927 0.220 0.82588
## Day9 -0.034802 0.460038 -0.076 0.93970
## Day10 -0.013420 0.574401 -0.023 0.98136
## Day11 -0.089970 0.706604 -0.127 0.89868
## Day12 0.069217 0.572467 0.121 0.90376
## Day13 -0.159286 0.682688 -0.233 0.81551
## Day14 -0.356361 0.679267 -0.525 0.59984
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## optimizer (Nelder_Mead) convergence code: 4 (failure to converge in 10000 evaluations)
## unable to evaluate scaled gradient
## Model failed to converge: degenerate Hessian with 24 negative eigenvalues
## failure to converge in 10000 evaluations
# Extract and exponentiate the fixed effect coefficients
fixed_effects <- fixef(random_coefficient_model)
exp_fixed_effects <- exp(fixed_effects)
# Print the exponentiated coefficients
exp_fixed_effects
## (Intercept) Day2 Day3 Day4 Day5 Day6
## 58.6991195 1.6028207 1.5998648 1.3744554 1.1568992 1.0373009
## Day7 Day8 Day9 Day10 Day11 Day12
## 0.9919928 1.1155190 0.9657963 0.9866700 0.9139586 1.0716686
## Day13 Day14
## 0.8527528 0.7002199
# Plot residuals for the glmer model
plot(residuals(random_coefficient_model, type = "pearson"), main = "Residuals random coeff")
#add a smooth line to the plot
lines(lowess(residuals(random_coefficient_model, type = "pearson")), col = "red")
Day: The overall effect of Day on cough_Day. Random Effects:
(1 | site): The intercept can vary across sites. Each site has its own baseline level of cough_Day. (Day | ID): Both the intercept and the slope of Day can vary across individuals. Each individual has their own baseline level of cough_Day and their own effect of Day.
This model allows for variability in the baseline count of coughs (intercept) across sites and variability in both the baseline count of coughs (intercept) and the effect of Day (slope) across individuals. Suitable if you believe the baseline cough count varies by site and the baseline cough count and rate of change with Day vary by individual.
#load library
library(glmmTMB)
# Fit the Random-Coefficients Poisson mixed-effects model using glmmTMB
poisson_random_coefficients_model_tmb_ID <- glmmTMB(cough_Day ~ Day + (1 | site) + (Day | ID),
family = poisson,
data = df)
# Generate Pearson residuals
residuals_tmb_ID <- residuals(poisson_random_coefficients_model_tmb_ID, type = "pearson")
# Create residual plot
plot(residuals_tmb_ID, main = "Residuals (glmmTMB Random Coefficients Model)",
ylab = "Pearson Residuals", xlab = "Index", pch = 16, cex = 0.6)
abline(h = 0, col = "red")
#compare the models
anova(random_intercept_model, random_coefficient_model, test = "Chisq")
## Data: df
## Models:
## random_intercept_model: cough_Day ~ Day + (1 | site) + (1 | ID)
## random_coefficient_model: cough_Day ~ Day + (Day | site) + (1 | ID)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## random_intercept_model 16 94488 94571 -47228 94456
## random_coefficient_model 120 92816 93446 -46288 92576 1879.2 104 < 2.2e-16
##
## random_intercept_model
## random_coefficient_model ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since the number of coughs varies between individuals, Day, and site, the Poisson Random Coefficients Model using glmmTMB appears to be the most appropriate choice. This model will account for variability in the baseline count of coughs across sites and both the baseline count and the effect of Day across individuals, providing a more flexible and accurate representation of our data.
#compare AIC and BIC
AIC(random_intercept_model, random_coefficient_model, poisson_random_coefficients_model_tmb_ID)
## df AIC
## random_intercept_model 16 94487.56
## random_coefficient_model 120 92816.37
## poisson_random_coefficients_model_tmb_ID 120 14019.78
BIC(random_intercept_model, random_coefficient_model, poisson_random_coefficients_model_tmb_ID)
## df BIC
## random_intercept_model 16 94571.47
## random_coefficient_model 120 93445.68
## poisson_random_coefficients_model_tmb_ID 120 14649.09
Compare GEE and GLMM
GEE Models fixed effects like Day but assumes the effect is consistent across all observations GLMM Models fixed effects like Day but allows the effect to vary across groups or subjects (random effects)
# Load necessary libraries
library(broom.mixed)
# Fit the glmmTMB model
glmmTMB_model <- glmmTMB(
cough_Day ~ Day +
(1 | site) + (Day | ID),
family = poisson,
data = df,
control = glmmTMBControl(optCtrl = list(iter.max = 10000, eval.max = 10000))
)
# Create a combined clustering variable
df <- df %>%
mutate(cluster = interaction(site, ID, drop = TRUE))
# Fit the GEE model with the combined clustering variable
gee_model <- geeglm(
cough_Day ~ Day ,
family = poisson,
data = df,
id = cluster, # Combined clustering variable
corstr = "exchangeable" # Correlation structure
)
# Extract and print fixed effects for glmmTMB
glmmTMB_fixed_effects <- broom.mixed::tidy(glmmTMB_model, effects = "fixed") %>%
mutate(p.value = round(p.value, 3))
print(glmmTMB_fixed_effects)
## # A tibble: 14 × 7
## effect component term estimate std.error statistic p.value
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 fixed cond (Intercept) 4.45 0.148 30.1 0
## 2 fixed cond Day2 0.217 0.114 1.90 0.057
## 3 fixed cond Day3 -0.0165 0.142 -0.117 0.907
## 4 fixed cond Day4 -0.552 0.203 -2.72 0.007
## 5 fixed cond Day5 -1.08 0.231 -4.68 0
## 6 fixed cond Day6 -1.52 0.301 -5.05 0
## 7 fixed cond Day7 -1.74 0.318 -5.48 0
## 8 fixed cond Day8 -1.66 0.307 -5.43 0
## 9 fixed cond Day9 -2.47 0.418 -5.91 0
## 10 fixed cond Day10 -1.97 0.344 -5.72 0
## 11 fixed cond Day11 -1.58 0.292 -5.41 0
## 12 fixed cond Day12 -1.90 0.355 -5.34 0
## 13 fixed cond Day13 -2.03 0.349 -5.82 0
## 14 fixed cond Day14 -2.29 0.319 -7.17 0
# Extract and print coefficients for GEE model
gee_fixed_effects <- broom::tidy(gee_model) %>%
mutate(p.value = round(p.value, 3))
print(gee_fixed_effects)
## # A tibble: 14 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 5.00 0.0997 2517. 0
## 2 Day2 0.493 0.0718 47.2 0
## 3 Day3 0.485 0.0681 50.8 0
## 4 Day4 0.311 0.0819 14.4 0
## 5 Day5 0.184 0.139 1.75 0.186
## 6 Day6 0.0447 0.106 0.178 0.673
## 7 Day7 0.00295 0.0950 0.000967 0.975
## 8 Day8 0.159 0.111 2.06 0.151
## 9 Day9 0.0258 0.117 0.0487 0.825
## 10 Day10 0.0705 0.150 0.221 0.638
## 11 Day11 -0.00729 0.107 0.00466 0.946
## 12 Day12 0.117 0.103 1.29 0.256
## 13 Day13 -0.0662 0.107 0.382 0.537
## 14 Day14 -0.340 0.142 5.77 0.016
# Summary for glmmTMB model
summary(glmmTMB_model)
## Family: poisson ( log )
## Formula: cough_Day ~ Day + (1 | site) + (Day | ID)
## Data: df
##
## AIC BIC logLik deviance df.resid
## 14019.8 14649.1 -6889.9 13779.8 1280
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev. Corr
## site (Intercept) 0.01603 0.1266
## ID (Intercept) 1.35150 1.1625
## Day2 1.22333 1.1060 0.12
## Day3 1.88846 1.3742 0.25 0.81
## Day4 3.74011 1.9339 0.24 0.43 0.72
## Day5 4.70845 2.1699 0.31 0.39 0.56 0.74
## Day6 7.63363 2.7629 0.28 0.34 0.46 0.63 0.86
## Day7 8.38341 2.8954 0.40 0.35 0.49 0.54 0.76 0.87
## Day8 7.77778 2.7889 0.48 0.39 0.48 0.46 0.67 0.74 0.94
## Day9 14.29932 3.7814 0.45 0.43 0.51 0.53 0.71 0.80 0.94 0.95
## Day10 9.68098 3.1114 0.43 0.42 0.48 0.48 0.58 0.69 0.85 0.88
## Day11 7.09321 2.6633 0.35 0.32 0.38 0.43 0.55 0.71 0.85 0.87
## Day12 10.48209 3.2376 0.40 0.42 0.46 0.43 0.59 0.73 0.85 0.87
## Day13 9.78080 3.1274 0.36 0.46 0.46 0.42 0.63 0.73 0.83 0.85
## Day14 7.98886 2.8265 0.45 0.44 0.51 0.39 0.52 0.65 0.82 0.85
##
##
##
##
##
##
##
##
##
##
##
## 0.95
## 0.89 0.94
## 0.92 0.97 0.96
## 0.91 0.92 0.91 0.95
## 0.89 0.92 0.89 0.93 0.92
## Number of obs: 1400, groups: site, 2; ID, 100
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.44848 0.14756 30.147 < 2e-16 ***
## Day2 0.21730 0.11433 1.901 0.05735 .
## Day3 -0.01655 0.14198 -0.117 0.90722
## Day4 -0.55179 0.20298 -2.718 0.00656 **
## Day5 -1.08119 0.23105 -4.679 2.88e-06 ***
## Day6 -1.51704 0.30065 -5.046 4.51e-07 ***
## Day7 -1.74198 0.31810 -5.476 4.35e-08 ***
## Day8 -1.66457 0.30664 -5.428 5.68e-08 ***
## Day9 -2.46822 0.41794 -5.906 3.51e-09 ***
## Day10 -1.96627 0.34392 -5.717 1.08e-08 ***
## Day11 -1.58190 0.29228 -5.412 6.22e-08 ***
## Day12 -1.89681 0.35549 -5.336 9.52e-08 ***
## Day13 -2.02765 0.34867 -5.815 6.05e-09 ***
## Day14 -2.28655 0.31891 -7.170 7.50e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Summary for GEE model
summary(gee_model)
##
## Call:
## geeglm(formula = cough_Day ~ Day, family = poisson, data = df,
## id = cluster, corstr = "exchangeable")
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) 5.001998 0.099697 2517.253 < 2e-16 ***
## Day2 0.493119 0.071781 47.194 6.43e-12 ***
## Day3 0.485492 0.068125 50.787 1.03e-12 ***
## Day4 0.310567 0.081854 14.396 0.000148 ***
## Day5 0.184326 0.139234 1.753 0.185552
## Day6 0.044712 0.105929 0.178 0.672957
## Day7 0.002954 0.095011 0.001 0.975193
## Day8 0.158780 0.110667 2.058 0.151359
## Day9 0.025756 0.116665 0.049 0.825270
## Day10 0.070485 0.149976 0.221 0.638375
## Day11 -0.007289 0.106799 0.005 0.945587
## Day12 0.116774 0.102852 1.289 0.256226
## Day13 -0.066230 0.107227 0.382 0.536799
## Day14 -0.340259 0.141625 5.772 0.016282 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation structure = exchangeable
## Estimated Scale Parameters:
##
## Estimate Std.err
## (Intercept) 345.6 74.1
## Link = identity
##
## Estimated Correlation Parameters:
## Estimate Std.err
## alpha 0.6792 0.09026
## Number of clusters: 100 Maximum cluster size: 14
# AIC and BIC for glmmTMB model
aic_glmmTMB <- AIC(glmmTMB_model)
bic_glmmTMB <- BIC(glmmTMB_model)
cat("AIC for glmmTMB model:", aic_glmmTMB, "\n")
## AIC for glmmTMB model: 14020
cat("BIC for glmmTMB model:", bic_glmmTMB, "\n")
## BIC for glmmTMB model: 14649
glmmTB residual plot
# Plot residuals for glmmTMB model
glmmTMB_residuals <- residuals(glmmTMB_model, type = "pearson")
ggplot(data = df, aes(x = seq_along(glmmTMB_residuals), y = glmmTMB_residuals)) +
geom_point() +
geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
labs(title = "Residuals (glmmTMB Model)", x = "Index", y = "Pearson Residuals") +
theme_minimal()
GEE residual plot
# Plot residuals for GEE model
gee_residuals <- residuals(gee_model, type = "pearson")
ggplot(data = df, aes(x = seq_along(gee_residuals), y = gee_residuals)) +
geom_point() +
geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
labs(title = "Residuals (GEE Model)", x = "Index", y = "Pearson Residuals") +
theme_minimal()
categorical versus continuous day
Continuous Day Model Linear Relationship: If there is a linear or approximately linear relationship between the variable and the outcome, it is appropriate to treat the variable as continuous. Example: If Day represents time and you expect the number of coughs to increase or decrease linearly over time.
Categorical Day Model Non-linear Relationships: If the relationship between the variable and the outcome is non-linear or you expect different effects at different levels, it is appropriate to treat the variable as categorical. Example: If the effect of Day on the number of coughs varies significantly at different times (e.g., different trends in different periods)
df$Day <- as.numeric(df$Day)
# Fit the GLMM with Poisson distribution and log link, Day as continuous
poisson_random_coefficients_model_cont <- glmmTMB(cough_Day ~ Day +(1 | site) + (Day | ID),
family = poisson,
data = df)
summary_cont <- summary(poisson_random_coefficients_model_cont)
print(summary_cont)
## Family: poisson ( log )
## Formula: cough_Day ~ Day + (1 | site) + (Day | ID)
## Data: df
##
## AIC BIC logLik deviance df.resid
## 71215 71247 -35602 71203 1394
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev. Corr
## site (Intercept) 0.0316 0.178
## ID (Intercept) 1.3004 1.140
## Day 0.0827 0.288 0.25
## Number of obs: 1400, groups: site, 2; ID, 100
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.0623 0.1703 29.73 < 2e-16 ***
## Day -0.1960 0.0294 -6.66 2.8e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#categorize BMI into underweight, normal, overweight, and obese
df$BMI_category <- cut(df$BMI, breaks = c(0, 18.5, 25, 30, Inf), labels = c("Underweight", "Normal", "Overweight", "Obese"))
#categorise Ageyrs into children, young adults, adults, and elderly
df$Age_category <- cut(df$Ageyrs, breaks = c(18, 40, 60, Inf), labels = c( "Young Adults", "Adults", "Elderly"))
table(df$Age_category)
##
## Young Adults Adults Elderly
## 1078 294 28
Bivariate analysis
Include all predictors for stepwise regression by backward elimination
df$Day <- as.numeric(df$Day)
# Fit the GLMM with Poisson distribution and log link, Day as continuous
poisson_random_coefficients_model_cont <- glmmTMB(cough_Day ~ Day + Ageyrs+ diabeticstatus + HIVStatus+Semiqtltyrslt + BCGscar + sex+Smoking+BMI+ (1 | site) + (Day | ID),
family = poisson,
data = df)
summary_cont <- summary(poisson_random_coefficients_model_cont)
print(summary_cont)
## Family: poisson ( log )
## Formula:
## cough_Day ~ Day + Ageyrs + diabeticstatus + HIVStatus + Semiqtltyrslt +
## BCGscar + sex + Smoking + BMI + (1 | site) + (Day | ID)
## Data: df
##
## AIC BIC logLik deviance df.resid
## 71215 71305 -35591 71181 1383
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev. Corr
## site (Intercept) 0.0668 0.259
## ID (Intercept) 1.0516 1.025
## Day 0.0804 0.284 0.28
## Number of obs: 1400, groups: site, 2; ID, 100
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.8573 0.7416 3.85 0.00012 ***
## Day -0.1947 0.0290 -6.71 1.9e-11 ***
## Ageyrs -0.0011 0.0105 -0.11 0.91592
## diabeticstatusYes 0.9957 0.7884 1.26 0.20661
## HIVStatusPositive -0.3612 0.2821 -1.28 0.20046
## SemiqtltyrsltLow -0.1080 0.2920 -0.37 0.71143
## SemiqtltyrsltMedium 0.5013 0.2618 1.91 0.05555 .
## SemiqtltyrsltTrace -0.2231 0.4347 -0.51 0.60770
## SemiqtltyrsltVery Low -0.1744 0.3724 -0.47 0.63948
## BCGscarYes 0.3003 0.2572 1.17 0.24292
## sexMale 0.7383 0.2281 3.24 0.00121 **
## SmokingYes 0.3578 0.3589 1.00 0.31879
## BMI 0.0754 0.0296 2.55 0.01089 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Round the coqefficient estimates to 2 decimal places and p-values to 3 decimal places
# Load necessary libraries
library(broom.mixed)
library(dplyr)
# Fit the initial full model
full_model <- glmmTMB(
cough_Day ~ Day + Semiqtltyrslt + diabeticstatus +
(1 | site) + (Day | ID),
family = poisson,
data = df,
control = glmmTMBControl(optCtrl = list(iter.max = 10000, eval.max = 10000))
)
# Extract the fixed effects and p-values
tidy_full_model <- tidy(full_model, effects = "fixed") %>%
mutate(p.value = round(p.value, 3))
# Compute confidence intervals
conf_int <- confint(full_model, parm = "beta_", level = 0.95)
conf_int <- as.data.frame(conf_int)
conf_int <- conf_int %>%
rownames_to_column(var = "term") %>%
rename(conf.low = `2.5 %`, conf.high = `97.5 %`)
# Join the confidence intervals with the model output
tidy_full_model <- left_join(tidy_full_model, conf_int, by = "term") %>%
mutate(
estimate = round(estimate, 2),
std.error = round(std.error, 2),
conf.low = round(conf.low, 2),
conf.high = round(conf.high, 2),
exp.estimate = round(exp(estimate), 2),
exp.conf.low = round(exp(conf.low), 2),
exp.conf.high = round(exp(conf.high), 2)
)
# Select and rename columns for clarity
tidy_full_model <- tidy_full_model %>%
dplyr::select(term, estimate, std.error, p.value, conf.low, conf.high, exp.estimate, exp.conf.low, exp.conf.high) %>%
dplyr::rename(
"Term" = term,
"Estimate" = estimate,
"Std. Error" = std.error,
"P-value" = p.value,
"CI Lower" = conf.low,
"CI Upper" = conf.high,
"Exp(Estimate)" = exp.estimate,
"Exp(CI Lower)" = exp.conf.low,
"Exp(CI Upper)" = exp.conf.high
)
# Print the model coefficients, p-values, and exponentiated estimates with confidence intervals
print(tidy_full_model)
## # A tibble: 7 × 9
## Term Estimate `Std. Error` `P-value` `CI Lower` `CI Upper` `Exp(Estimate)`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Interc… 5.02 0.24 0 4.56 5.48 151.
## 2 Day -0.2 0.03 0 -0.25 -0.14 0.82
## 3 Semiqtl… -0.18 0.31 0.559 -0.78 0.42 0.84
## 4 Semiqtl… 0.34 0.28 0.223 -0.2 0.88 1.4
## 5 Semiqtl… -0.37 0.45 0.416 -1.25 0.52 0.69
## 6 Semiqtl… -0.1 0.39 0.805 -0.87 0.67 0.9
## 7 diabeti… 1.28 0.81 0.112 -0.3 2.86 3.6
## # ℹ 2 more variables: `Exp(CI Lower)` <dbl>, `Exp(CI Upper)` <dbl>
# Fit the initial full model
full_model <- glmmTMB(
cough_Day ~ Day + Semiqtltyrslt + diabeticstatus +
(1 | site) + (Day | ID),
family = poisson,
data = df,
control = glmmTMBControl(optCtrl = list(iter.max = 10000, eval.max = 10000))
)
# Extract the fixed effects and p-values
tidy_full_model <- tidy(full_model, effects = "fixed") %>%
mutate(p.value = round(p.value, 3))
# Compute confidence intervals
conf_int <- confint(full_model, parm = "beta_", level = 0.95)
conf_int <- as.data.frame(conf_int)
conf_int <- conf_int %>%
rownames_to_column(var = "term") %>%
rename(conf.low = `2.5 %`, conf.high = `97.5 %`)
# Join the confidence intervals with the model output
tidy_full_model <- left_join(tidy_full_model, conf_int, by = "term") %>%
mutate(
exp.estimate = round(exp(estimate), 2),
exp.conf.low = round(exp(conf.low), 2),
exp.conf.high = round(exp(conf.high), 2),
estimate = round(estimate, 2),
std.error = round(std.error, 2),
conf.low = round(conf.low, 2),
conf.high = round(conf.high, 2)
)
# Select and rename columns for clarity
tidy_full_model <- tidy_full_model %>%
dplyr::select(term, estimate, std.error, p.value, conf.low, conf.high, exp.estimate, exp.conf.low, exp.conf.high) %>%
dplyr::rename(
"Term" = term,
"Estimate" = estimate,
"Std. Error" = std.error,
"P-value" = p.value,
"CI Lower" = conf.low,
"CI Upper" = conf.high,
"Exp(Estimate)" = exp.estimate,
"Exp(CI Lower)" = exp.conf.low,
"Exp(CI Upper)" = exp.conf.high
)
# Print the model coefficients, p-values, and exponentiated estimates with confidence intervals
print(tidy_full_model)
## # A tibble: 7 × 9
## Term Estimate `Std. Error` `P-value` `CI Lower` `CI Upper` `Exp(Estimate)`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Interc… 5.02 0.24 0 4.56 5.48 152.
## 2 Day -0.2 0.03 0 -0.25 -0.14 0.82
## 3 Semiqtl… -0.18 0.31 0.559 -0.78 0.42 0.84
## 4 Semiqtl… 0.34 0.28 0.223 -0.2 0.88 1.4
## 5 Semiqtl… -0.37 0.45 0.416 -1.25 0.52 0.69
## 6 Semiqtl… -0.1 0.39 0.805 -0.87 0.67 0.91
## 7 diabeti… 1.28 0.81 0.112 -0.3 2.86 3.6
## # ℹ 2 more variables: `Exp(CI Lower)` <dbl>, `Exp(CI Upper)` <dbl>
Time X Site interaction with no covariates
# Fit the GLMM with Poisson distribution and log link, Day as continuous
poisson_random_coefficients_model_cont <- glmmTMB(cough_Day ~ Day + site + (1 | site) + (Day | ID),
family = poisson,
data = df)
summary_cont <- summary(poisson_random_coefficients_model_cont)
print(summary_cont)
## Family: poisson ( log )
## Formula: cough_Day ~ Day + site + (1 | site) + (Day | ID)
## Data: df
##
## AIC BIC logLik deviance df.resid
## 71213 71250 -35600 71199 1393
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev. Corr
## site (Intercept) 1.00e-07 0.000316
## ID (Intercept) 1.28e+00 1.129272
## Day 8.35e-02 0.289020 0.26
## Number of obs: 1400, groups: site, 2; ID, 100
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.2999 0.1575 33.7 < 2e-16 ***
## Day -0.1965 0.0296 -6.6 3.1e-11 ***
## siteMulago -0.4773 0.2209 -2.2 0.031 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Time X Site interaction with covariates
# Fit the GLMM with Poisson distribution and log link, Day as continuous
poisson_random_coefficients_model_cont <- glmmTMB(cough_Day ~ Day + Day*site + sex*Day + diabeticstatus*Day + BMI*Day + (1 | site) + (Day | ID),
family = poisson,
data = df)
summary_cont <- summary(poisson_random_coefficients_model_cont)
print(summary_cont)
## Family: poisson ( log )
## Formula:
## cough_Day ~ Day + Day * site + sex * Day + diabeticstatus * Day +
## BMI * Day + (1 | site) + (Day | ID)
## Data: df
##
## AIC BIC logLik deviance df.resid
## 71211 71284 -35591 71183 1386
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev. Corr
## site (Intercept) 5.87e-08 0.000242
## ID (Intercept) 1.18e+00 1.084355
## Day 7.41e-02 0.272180 0.31
## Number of obs: 1400, groups: site, 2; ID, 100
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.74615 0.69776 5.37 7.9e-08 ***
## Day -0.15397 0.17734 -0.87 0.385
## siteMulago -0.36753 0.22177 -1.66 0.097 .
## sexMale 0.66506 0.23669 2.81 0.005 **
## diabeticstatusYes 0.85576 0.79056 1.08 0.279
## BMI 0.05348 0.03019 1.77 0.076 .
## Day:siteMulago 0.03265 0.05627 0.58 0.562
## Day:sexMale 0.02387 0.06008 0.40 0.691
## Day:diabeticstatusYes -0.38670 0.19910 -1.94 0.052 .
## Day:BMI -0.00293 0.00763 -0.38 0.701
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Remove site interaction
# Fit the GLMM with Poisson distribution and log link, Day as continuous
poisson_random_coefficients_model_cont <- glmmTMB(cough_Day ~ Day + sex*Day + diabeticstatus*Day + BMI*Day + (1 | site) + (Day | ID),
family = poisson,
data = df)
summary_cont <- summary(poisson_random_coefficients_model_cont)
print(summary_cont)
## Family: poisson ( log )
## Formula:
## cough_Day ~ Day + sex * Day + diabeticstatus * Day + BMI * Day +
## (1 | site) + (Day | ID)
## Data: df
##
## AIC BIC logLik deviance df.resid
## 71210 71273 -35593 71186 1388
##
## Random effects:
##
## Conditional model:
## Groups Name Variance Std.Dev. Corr
## site (Intercept) 0.0167 0.129
## ID (Intercept) 1.2026 1.097
## Day 0.0729 0.270 0.30
## Number of obs: 1400, groups: site, 2; ID, 100
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.58205 0.70749 5.06 4.1e-07 ***
## Day -0.13779 0.17442 -0.79 0.4295
## sexMale 0.67903 0.23960 2.83 0.0046 **
## diabeticstatusYes 0.96181 0.80413 1.20 0.2317
## BMI 0.05207 0.03055 1.70 0.0883 .
## Day:sexMale 0.02091 0.05947 0.35 0.7251
## Day:diabeticstatusYes -0.40451 0.19521 -2.07 0.0383 *
## Day:BMI -0.00280 0.00756 -0.37 0.7115
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1