SET WORKING DIRECTORY AND LOAD LIBRARIES

setwd("D:/ACADEMIA/Research/Lucy")
rm(list=ls())
library(readr)
library(tidyverse)
data_wide <- read_csv("data.csv")
data_long <- read_csv("data_long.csv")

LIBRARIES AND THEIR APPLICATIONS

library(labelled)   # labeling data
library(rstatix)    # summary statistics
library(ggpubr)     # convenient summary statistics and plots
library(GGally)     # advanced plot
library(car)        # useful for anova/wald test
library(Epi)        # easy getting CI for model coef/pred
library(lme4)       # linear mixed-effects models
library(lmerTest)   # test for linear mixed-effects models
library(emmeans)    # marginal means
library(multcomp)   # CI for linear combinations of model coef
library(geepack)    # generalized estimating equations
library(ggeffects)  # marginal effects, adjusted predictions
library(gt)         # nice tables
library(tidyverse)  # for everything (data manipulation, visualization, coding, and more)
theme_set(theme_minimal() + theme(legend.position = "bottom")) # theme for ggplot

library(haven)
library(viridis)
library(hrbrthemes)
library(scales)

READ COLUMN NAMES

colnames(data_long)
##  [1] "ID"                      "Day"                    
##  [3] "StudyID"                 "site"                   
##  [5] "weekstreatmentadherence" "sex"                    
##  [7] "HIVStatus"               "Semiqtltyrslt"          
##  [9] "Smoking"                 "diabeticstatus"         
## [11] "BCGscar"                 "Ageyrs"                 
## [13] "HeightCMs"               "Weightkgs"              
## [15] "BMI"                     "agecat"                 
## [17] "BMIcat"                  "cough_Day"
df <- data_long

RECODE DATA STRUCTURE OR DATA TYPES

df$Day <- factor(df$Day, levels = 1:14, labels = c("Day 1", "Day 2", "Day 3", "Day 4", "Day 5", "Day 6", "Day 7", "Day 8", "Day 9", "Day 10", "Day 11", "Day 12", "Day 13", "Day 14"))

df$site <- as.factor(df$site)
df$HIVStatus <- as.factor(df$HIVStatus)
df$sex <- as.factor(df$sex)
df$diabeticstatus <- as.factor(df$diabeticstatus)
df$Smoking <- as.factor(df$Smoking)
df$BCGscar <- as.factor(df$BCGscar)
df$Semiqtltyrslt <- as.factor(df$Semiqtltyrslt)

df$ID <-as.numeric(df$ID)
df$Ageyrs <-as.numeric(df$Ageyrs)
df$BMI <-as.numeric(df$BMI)

SUMMARIZE OUTCOME BY TIME PERIODS

group_by(df, Day) %>% 
  get_summary_stats(cough_Day)
## # A tibble: 14 × 14
##    Day    variable      n   min   max median    q1    q3   iqr   mad  mean    sd
##    <fct>  <fct>     <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 Day 1  cough_Day   100     3   702  100   38     212.  174. 111.   149.  149.
##  2 Day 2  cough_Day   100     0  1438  154.  50     385   335  201.   244.  248.
##  3 Day 3  cough_Day   100     0  1212  152   29.2   389   360. 208.   242.  261.
##  4 Day 4  cough_Day   100     0  1696  112.  20     297.  277. 167.   203.  258.
##  5 Day 5  cough_Day   100     0  2351   70    6.75  186.  180. 104.   179.  326.
##  6 Day 6  cough_Day   100     0  1268   80    2     190.  188. 119.   156.  235.
##  7 Day 7  cough_Day   100     0  1181   64    1.75  200   198.  94.9  149.  221.
##  8 Day 8  cough_Day   100     0  1333   56.5  0     234.  234.  83.8  174.  273.
##  9 Day 9  cough_Day   100     0  1318   39    0     200   200   57.8  153.  239.
## 10 Day 10 cough_Day   100     0  1999   52.5  0     183   183   77.8  160.  276.
## 11 Day 11 cough_Day   100     0   895   50    5     190   185   74.1  148.  207.
## 12 Day 12 cough_Day   100     0  1045   40.5  1.5   215   214.  60.0  167.  243.
## 13 Day 13 cough_Day   100     0  1003   40    1.5   155.  153.  59.3  139.  214.
## 14 Day 14 cough_Day   100     0  1091   30    0.75  115   114.  44.5  106.  192.
## # ℹ 2 more variables: se <dbl>, ci <dbl>

LOOK AT A PAIRWISE CORRELATION MATRIX

#transform the data from long to wide format
wide_data <- df %>%
  pivot_wider(names_from = Day, values_from = cough_Day)

head(wide_data)
## # A tibble: 6 × 30
##      ID StudyID   site   weekstreatmentadherence sex    HIVStatus Semiqtltyrslt
##   <dbl> <chr>     <fct>  <chr>                   <fct>  <fct>     <fct>        
## 1     1 R2D204007 Mulago Yes                     Male   Negative  High         
## 2     2 R2D204004 Mulago Yes                     Male   Negative  High         
## 3     3 R2D204008 Mulago Yes                     Male   Negative  Low          
## 4     4 R2D204005 Mulago Yes                     Male   Negative  High         
## 5     5 R2D204012 Mulago Yes                     Female Negative  High         
## 6     6 R2D204010 Mulago Yes                     Male   Negative  Medium       
## # ℹ 23 more variables: Smoking <fct>, diabeticstatus <fct>, BCGscar <fct>,
## #   Ageyrs <dbl>, HeightCMs <dbl>, Weightkgs <dbl>, BMI <dbl>, agecat <chr>,
## #   BMIcat <chr>, `Day 1` <dbl>, `Day 2` <dbl>, `Day 3` <dbl>, `Day 4` <dbl>,
## #   `Day 5` <dbl>, `Day 6` <dbl>, `Day 7` <dbl>, `Day 8` <dbl>, `Day 9` <dbl>,
## #   `Day 10` <dbl>, `Day 11` <dbl>, `Day 12` <dbl>, `Day 13` <dbl>,
## #   `Day 14` <dbl>
#plot of correlation matrix from Day 1 to Day 14 columns
correlation_matrix <- cor(wide_data[, 17:30])

correlation_matrix <- round(correlation_matrix, 2)
correlation_matrix
##        Day 1 Day 2 Day 3 Day 4 Day 5 Day 6 Day 7 Day 8 Day 9 Day 10 Day 11
## Day 1   1.00  0.75  0.79  0.76  0.65  0.71  0.77  0.71  0.66   0.50   0.65
## Day 2   0.75  1.00  0.89  0.75  0.77  0.74  0.80  0.72  0.61   0.43   0.60
## Day 3   0.79  0.89  1.00  0.82  0.74  0.72  0.76  0.70  0.66   0.47   0.65
## Day 4   0.76  0.75  0.82  1.00  0.79  0.79  0.82  0.70  0.67   0.45   0.64
## Day 5   0.65  0.77  0.74  0.79  1.00  0.88  0.81  0.70  0.59   0.35   0.57
## Day 6   0.71  0.74  0.72  0.79  0.88  1.00  0.86  0.72  0.63   0.43   0.63
## Day 7   0.77  0.80  0.76  0.82  0.81  0.86  1.00  0.82  0.69   0.51   0.74
## Day 8   0.71  0.72  0.70  0.70  0.70  0.72  0.82  1.00  0.88   0.77   0.83
## Day 9   0.66  0.61  0.66  0.67  0.59  0.63  0.69  0.88  1.00   0.84   0.81
## Day 10  0.50  0.43  0.47  0.45  0.35  0.43  0.51  0.77  0.84   1.00   0.85
## Day 11  0.65  0.60  0.65  0.64  0.57  0.63  0.74  0.83  0.81   0.85   1.00
## Day 12  0.70  0.65  0.69  0.63  0.61  0.68  0.74  0.83  0.79   0.79   0.89
## Day 13  0.71  0.61  0.65  0.64  0.62  0.66  0.70  0.82  0.89   0.80   0.83
## Day 14  0.62  0.53  0.59  0.49  0.47  0.52  0.67  0.67  0.68   0.62   0.74
##        Day 12 Day 13 Day 14
## Day 1    0.70   0.71   0.62
## Day 2    0.65   0.61   0.53
## Day 3    0.69   0.65   0.59
## Day 4    0.63   0.64   0.49
## Day 5    0.61   0.62   0.47
## Day 6    0.68   0.66   0.52
## Day 7    0.74   0.70   0.67
## Day 8    0.83   0.82   0.67
## Day 9    0.79   0.89   0.68
## Day 10   0.79   0.80   0.62
## Day 11   0.89   0.83   0.74
## Day 12   1.00   0.88   0.83
## Day 13   0.88   1.00   0.77
## Day 14   0.83   0.77   1.00

VISUALIZE THE CORRELATION MATRIX USING A HEATMAP

#plot of correlation matrix heat map from Day 1 to Day 14 columns
library(reshape2) # for melt function

correlation_matrix_melted <- melt(correlation_matrix)

p <- ggplot(correlation_matrix_melted, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_viridis() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "right") +
  labs(title = "Correlation Matrix Heatmap",
       x = "Day",
       y = "Day")

print(p)

VISUALIZE THE CORRELATION MATRIX USING A PAIRWISE SCATTERPLOT

ggpairs(wide_data, mapping = aes(), columns = 17:20,
        lower = list(continuous = "smooth"))

CORRELATION BY SITE

ggpairs(wide_data, mapping = aes(color = site), columns = 17:20,
        lower = list(continuous = "smooth"))

SUMMARY STATISTICS

#summarize cough_Day by mean, standard deviation, median, and IQR
df_summary <- df %>%
  summarise(mean = mean(cough_Day),
            sd = sd(cough_Day),
            median = median(cough_Day),
            q1 = quantile(cough_Day, 0.25),
            q3 = quantile(cough_Day, 0.75))

print(df_summary)
## # A tibble: 1 × 5
##    mean    sd median    q1    q3
##   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1  169.  244.     70     6   240

PLOT INDIVIDUAL PROFILES OF COUGHS USING A SPAGHETTI PLOT

# Create the spaghetti plot
p <- ggplot(df, aes(x = Day, y = cough_Day, group = ID, color = as.factor(ID))) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  labs(title = "Individual profiles of coughs",
       x = "Time (Days)",
       y = "Coughs") +
  theme_update(plot.title = element_text(hjust = 0.5))+
  theme(legend.position = "none")

print(p)

PLOT INDIVIDUAL PROFILES OF COUGHS BY STUDY SITE

#create a spaghetti plot by site
p <- ggplot(df, aes(x = Day, y = cough_Day, group = ID, color = site)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  labs(title = "Individual profiles of coughs by site",
       x = "Time (Days)",
       y = "Coughs") +
  theme_update(plot.title = element_text(hjust = 0.5))+
  theme(legend.position = "bottom")
print(p)

VISUALIZE THE DISTRIBUTION OF COUGHS USING A HISTOGRAM

#HISTOGRAM - cough_Day
#Visualize the distribution of cough_Day (whether there is a violation of the multivariate normality assumption);
#It is difficult to visualize normality of a vector (multiple time points = multiple axes)
#We can look at the overall distribution (as if there is only one time point)
#if we see deviations from Normality here, there are likely deviations at the multivariate level;
#Note: number of 30 bins are selected automatically in this code
p2 <- ggplot(data=df, aes(x=cough_Day))
p2 + geom_histogram(color="black", fill="lightblue", bins=20) +
  ggtitle("Distribution of Coughs")

#Add a kernel density line using geom_density and a Normal curve using stat_function
#Note: histogram binwidth is calculated directly from density function
#suggested bimodal distribution?

p2 + geom_histogram(aes(y=..density..),  color="black", fill="lightblue", binwidth=density(df$cough_Day)$bw) +
  geom_density( linetype="dashed") + 
  ggtitle("Distribution of coughs")+
  stat_function(fun = dnorm, args = list(mean=mean(df$cough_Day), sd=sd(df$cough_Day)))

#OVERLAYED TRANSPARENT HISTOGRAM of cough_Day by site
#If the bimodal pattern persists by group, this is more likely to be a data artificial (ex. missing people in study), rather than true non-normality
p2 <- ggplot(data=df, aes(x=cough_Day, fill=site))
p2 + geom_histogram(color="black", alpha=0.5, position="identity", bins = 20) +
  labs(title="Distribution of coughs by study site", x="coughs", fill="site")

p3 <- ggplot(data=df, aes(x=cough_Day))
p3 + geom_histogram(aes(y = stat(density), color=site, fill=site), alpha=0.5, position="identity", bins = 20) +
  geom_density(aes(color = site), linetype="dashed") + 
  labs(title="Distribution of coughs by study site",
       x="Coughs", color="site", fill="site")

VISUALIZE THE DISTRIBUTION OF COUGHS BY STUDY SITE USING A BOX PLOT

#BOX PLOT - cough_Day by site

p4 <- ggplot(data=df, aes(y=cough_Day, x=site))
p4 + geom_boxplot() + 
  labs(title="Distribution of coughs by site", 
                           x="site", y="coughs")+stat_summary(fun.y=mean, geom="point", shape=20, size=3, color="red", fill="red") 

#R defines outliers that are more extreme than +/- 1.5(IQR)
#Note: the identification of outliers is SUBJECTIVE!!! Do not simply discard data outside the fence - know your subject matter.
#Check whether the potential outliers are actually biologically plausible or whether they are data entry errors!

VISUALIZE THE DISTRIBUTION OF COUGHS BY TIME PERIOD USING A BOX PLOT

#BOX PLOT - cough_Day by Day

p4 <- ggplot(data=df, aes(y=cough_Day, x=Day))
p4 + geom_boxplot() + 
  labs(title="Distribution of coughs by Time", 
                           x="Time (Days)", y="coughs")+stat_summary(fun.y=mean, geom="point", shape=20, size=3, color="red", fill="red") 

#boxplot of cough_Day by Day and site
p <- ggplot(df, aes(x = Day, y = cough_Day, fill = site)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Coughs by Time and Site",
       x = "Time (Days)",
       y = "Coughs") +
  theme(plot.title = element_text(hjust = 0.5))+stat_summary(fun.y=mean, geom="point", shape=20, size=3, color="blue", fill="blue") 

print(p)

VISUALIZE THE MEAN COUGHS BY TIME PERIOD USING A LINE PLOT

# Calculate the mean and standard error of cough_Day by Day
df_summary <- df %>%
  group_by(Day) %>%
  summarise(mean = mean(cough_Day),
            se = sd(cough_Day) / sqrt(n()))

# Create the line plot
p <- ggplot(df_summary, aes(x = Day, y = mean, group = 1, color = Day)) +
  geom_line() +
  geom_point() +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.1) +
  theme_minimal() +
  labs(title = "Mean Coughs by Time",
       x = "Time (Days)",
       y = "Coughs") +
  theme(plot.title = element_text(hjust = 0.5))+
  theme(legend.position = "none")

print(p)

VISUALIZE THE MEAN COUGHS BY TIME PERIOD AND STUDY SITE USING A LINE PLOT

# Calculate the mean and standard error of cough_Day by Day and site
df_summary <- df %>%
  group_by(Day, site) %>%
  summarise(mean = mean(cough_Day),
            se = sd(cough_Day) / sqrt(n()),
            .groups = 'drop')  # Ensure summarise doesn't keep grouping

# Create the line plot separated by site and colored by site
p <- ggplot(df_summary, aes(x = Day, y = mean, group = site, color = site)) +
  geom_line() +
  geom_point() +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.1) +
  theme_minimal() +
  labs(title = "Mean Coughs by Time and Site",
       x = "Time (Days)",
       y = "Coughs") +
  theme(plot.title = element_text(hjust = 0.5))

print(p)

VISUALIZE THE MEDIAN COUGHS BY TIME PERIOD USING A LINE PLOT

# Calculate the median and IQR of cough_Day by Day
df_summary <- df %>%
  group_by(Day) %>%
  summarise(median = median(cough_Day),
            q1 = quantile(cough_Day, 0.25),
            q3 = quantile(cough_Day, 0.75))

# Create the line plot
p <- ggplot(df_summary, aes(x = Day, y = median, group = 1, color = Day)) +
  geom_line() +
  geom_point() +
  geom_errorbar(aes(ymin = q1, ymax = q3), width = 0.1) +
  theme_minimal() +
  labs(title = "Median Coughs by Time",
       x = "Time (Days)",
       y = "Coughs") +
  theme(plot.title = element_text(hjust = 0.5))+
  theme(legend.position = "none")

print(p)

VISUALIZE THE MEDIAN COUGHS BY TIME PERIOD AND STUDY SITE USING A LINE PLOT

# Calculate the median and IQR of cough_Day by Day and site
df_summary <- df %>%
  group_by(Day, site) %>%
  summarise(median = median(cough_Day),
            q1 = quantile(cough_Day, 0.25),
            q3 = quantile(cough_Day, 0.75),
            .groups = 'drop')  # Ensure summarise doesn't keep grouping

# Create the line plot separated by site and colored by site
p <- ggplot(df_summary, aes(x = Day, y = median, group = site, color = site)) +
  geom_line() +
  geom_point() +
  geom_errorbar(aes(ymin = q1, ymax = q3), width = 0.1) +
  theme_minimal() +
  labs(title = "Median Coughs by Time and Site",
       x = "Time (Days)",
       y = "Coughs") +
  theme(plot.title = element_text(hjust = 0.5))

print(p)

VISUALIZE THE COUGHS BY AGE AND STUDY SITE USING A SCATTER PLOT

#scatter plot of cough_Day by Ageyrs and site
p <- ggplot(df, aes(x = Ageyrs, y = cough_Day, color = site)) +
  geom_point() +
  theme_minimal() +
  labs(title = "Coughs by Age and Site",
       x = "Age (Years)",
       y = "Coughs") +
  theme(plot.title = element_text(hjust = 0.5))

print(p)

Lets try to transform the OUTCOME using natural log to see if we can get a better model fit. We shall include only finite values in the transformation.

#create a variable of log transformed cough_Day round to 2 decimal places include only finite values
df$log_cough_Day <- round(log(df$cough_Day), 2)
df3 <- df
df3 <- df3[is.finite(df$log_cough_Day), ]
#plot the histogram of log transformed cough_Day
p <- ggplot(data=df3, aes(x=log_cough_Day))
p + geom_histogram(color="black", fill="lightblue", bins=15) +
  ggtitle("Distribution of log transformed coughs")

#Add a kernel density line using geom_density and a Normal curve using stat_function
p + geom_histogram(aes(y=..density..),  color="black", fill="lightblue", binwidth=density(df3$log_cough_Day)$bw) +
  geom_density( linetype="dashed") + 
  ggtitle("Distribution of log transformed coughs")+
  stat_function(fun = dnorm, args = list(mean=mean(df3$log_cough_Day), sd=sd(df3$log_cough_Day)))

#summarize the log transformed cough_Day by mean, standard deviation, median, and IQR
df_summary <- df3 %>%
  summarise(mean = mean(log_cough_Day),
            sd = sd(log_cough_Day),
            median = median(log_cough_Day),
            q1 = quantile(log_cough_Day, 0.25),
            q3 = quantile(log_cough_Day, 0.75))

print(df_summary)
## # A tibble: 1 × 5
##    mean    sd median    q1    q3
##   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1  4.48  1.56    4.7  3.53   5.7
#spaghetti plot of log transformed cough_Day
p <- ggplot(df3, aes(x = Day, y = log_cough_Day, group = ID, color = as.factor(ID))) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  labs(title = "Individual profiles of log transformed coughs",
       x = "Time (Days)",
       y = "Log Coughs") +
  theme(plot.title = element_text(hjust = 0.5))+
  theme(legend.position = "none")

print(p)

#Average trendline of log transformed cough_Day by Day using a loess
df4 <- df3
df4$Day <- as.numeric(df4$Day)


p <- ggplot(df4, aes(x = Day, y = log_cough_Day)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE, formula = y~x) +
  theme_minimal() +
  labs(title = "Average trendline of log transformed coughs by Time",
       x = "Time (Days)",
       y = "Log Coughs") +
  theme(plot.title = element_text(hjust = 0.5))

print(p)

#overlayed transparent histogram of log transformed cough_Day by site
p2 <- ggplot(data=df3, aes(x=log_cough_Day, fill=site))
p2 + geom_histogram(color="black", alpha=0.5, position="identity", bins = 15) +
  labs(title="Distribution of log transformed coughs by study site", x="log coughs", fill="site")

p3 <- ggplot(data=df3, aes(x=log_cough_Day))
p3 + geom_histogram(aes(y = stat(density), color=site, fill=site), alpha=0.5, position="identity", bins = 15) +
  geom_density(aes(color = site), linetype="dashed") + 
  labs(title="Distribution of log transformed coughs by study site",
       x="Log Coughs", color="site", fill="site")

#boxplot of log transformed cough_Day by Day and site
p <- ggplot(df3, aes(x = Day, y = log_cough_Day, fill = site)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Log Coughs by Time and Site",
       x = "Time (Days)",
       y = "Log Coughs") +
  theme(plot.title = element_text(hjust = 0.5))

print(p)

# Calculate the mean and standard error of log_cough_Day by Day
df_summary <- df3 %>%
  group_by(Day) %>%
  summarise(mean = mean(log_cough_Day),
            se = sd(log_cough_Day) / sqrt(n()))

# Create the line plot
p <- ggplot(df_summary, aes(x = Day, y = mean, group = 1, color = Day)) +
  geom_line() +
  geom_point() +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.1) +
  theme_minimal() +
  labs(title = "Mean Log Coughs by Time",
       x = "Time (Days)",
       y = "Log Coughs") +
  theme(plot.title = element_text(hjust = 0.5))+
  theme(legend.position = "none")

print(p)

# Calculate the mean and standard error of log_cough_Day by Day and site
df_summary <- df3 %>%
  group_by(Day, site) %>%
  summarise(mean = mean(log_cough_Day),
            se = sd(log_cough_Day) / sqrt(n()), 
            .groups = 'drop')  # Ensure summarise doesn't keep grouping

# Create the line plot separated by site and colored by site
p <- ggplot(df_summary, aes(x = Day, y = mean, group = site, color = site)) +
  geom_line() +
  geom_point() +
  geom_errorbar(aes(ymin = mean - se, ymax = mean + se), width = 0.1) +
  theme_minimal() +
  labs(title = "Mean Log Coughs by Time and Site",
       x = "Time (Days)",
       y = "Log Coughs") +
  theme(plot.title = element_text(hjust = 0.5))

print(p)

# Calculate the median and IQR of log_cough_Day by Day
df_summary <- df3 %>%
  group_by(Day) %>%
  summarise(median = median(log_cough_Day),
            q1 = quantile(log_cough_Day, 0.25),
            q3 = quantile(log_cough_Day, 0.75))

# Create the line plot
p <- ggplot(df_summary, aes(x = Day, y = median, group = 1, color = Day)) +
  geom_line() +
  geom_point() +
  geom_errorbar(aes(ymin = q1, ymax = q3), width = 0.1) +
  theme_minimal() +
  labs(title = "Median Log Coughs by Time",
       x = "Time (Days)",
       y = "Log Coughs") +
  theme(plot.title = element_text(hjust = 0.5))+
  theme(legend.position = "none")

print(p)

# Calculate the median and IQR of log_cough_Day by Day and site
df_summary <- df3 %>%
  group_by(Day, site) %>%
  summarise(median = median(log_cough_Day),
            q1 = quantile(log_cough_Day, 0.25),
            q3 = quantile(log_cough_Day, 0.75), 
            .groups = 'drop')  # Ensure summarise doesn't keep grouping

# Create the line plot separated by site and colored by site
p <- ggplot(df_summary, aes(x = Day, y = median, group = site, color = site)) +
  geom_line() +
  geom_point() +
  geom_errorbar(aes(ymin = q1, ymax = q3), width = 0.1) +
  theme_minimal() +
  labs(title = "Median Log Coughs by Time and Site",
       x = "Time (Days)",
       y = "Log Coughs") +
  theme(plot.title = element_text(hjust = 0.5))

print(p)

#scatter plot of log_cough_Day by Ageyrs and site
p <- ggplot(df3, aes(x = Ageyrs, y = log_cough_Day, color = site)) +
  geom_point( aes(shape = site)) +
  theme_minimal() +
  labs(title = "Log Coughs by Age and Site",
       x = "Age (Years)",
       y = "Log Coughs") +
  theme(plot.title = element_text(hjust = 0.5))

print(p)

Reminding ourselves about the necessary libraries

library(lme4) # for linear mixed-effects models
library(lmerTest) # for p-values of linear mixed-effects models
library(glmmTMB) # for Poisson mixed-effects models

REGRESSION ANALYSIS

  1. GLM MODEL This model does not include random effects. If you suspect that there is random variability at the site or individual level beyond what is captured by the fixed effects, a mixed-effects model (e.g., using glmer or glmmTMB) might be more appropriate.)
#convert Day to numeric
df$Day <- as.numeric(df$Day)

# Fit the GLM with Poisson distribution and log link, Day as continuous
poisson_model_cont <- glm(cough_Day ~ Day + site + ID, 
                          family = poisson(link = "log"), 
                          data = df)
# Check for overdispersion
dispersion <- sum(residuals(poisson_model_cont, type = "pearson")^2) / df.residual(poisson_model_cont)
print(paste("Dispersion parameter:", dispersion))
## [1] "Dispersion parameter: 287.10163875548"
# Summary of the model
summary_cont <- summary(poisson_model_cont)
print(summary_cont)
## 
## Call:
## glm(formula = cough_Day ~ Day + site + ID, family = poisson(link = "log"), 
##     data = df)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  5.074e+00  7.150e-03  709.64   <2e-16 ***
## Day         -3.712e-02  5.134e-04  -72.31   <2e-16 ***
## siteMulago  -3.578e-01  4.859e-03  -73.65   <2e-16 ***
## ID           8.573e-03  8.155e-05  105.12   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 380005  on 1399  degrees of freedom
## Residual deviance: 343954  on 1396  degrees of freedom
## AIC: 351119
## 
## Number of Fisher Scoring iterations: 6

Plot the residuals for the continuous Day model

# Extract residuals
residuals_poisson_cont <- residuals(poisson_model_cont, type = "deviance")

# Plot residuals for continuous Day model
ggplot(data = df, aes(x = seq_along(residuals_poisson_cont), y = residuals_poisson_cont)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  labs(title = "Residuals (GLM Poisson - Day as Continuous)",
       x = "Index",
       y = "Deviance Residuals") +
  theme_minimal()

Lets fit a GLM negative binomial mixed effects model

# Install and load necessary library
#install.packages("MASS")
library(MASS)
#convert Day to numeric
df$Day <- as.numeric(df$Day)

# Fit the Negative Binomial model
nb_model_cont <- glm.nb(cough_Day ~ Day + site + ID, data = df)

# Summary of the model
summary(nb_model_cont)
## 
## Call:
## glm.nb(formula = cough_Day ~ Day + site + ID, data = df, init.theta = 0.3320238524, 
##     link = log)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  5.139069   0.156147  32.912  < 2e-16 ***
## Day         -0.048277   0.011519  -4.191 2.78e-05 ***
## siteMulago  -0.324079   0.103386  -3.135  0.00172 ** 
## ID           0.008581   0.001790   4.793 1.64e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.332) family taken to be 1)
## 
##     Null deviance: 1773.6  on 1399  degrees of freedom
## Residual deviance: 1700.7  on 1396  degrees of freedom
## AIC: 15685
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.3320 
##           Std. Err.:  0.0118 
## 
##  2 x log-likelihood:  -15675.1320
# Check dispersion
dispersion_nb <- sum(residuals(nb_model_cont, type = "pearson")^2) / df.residual(nb_model_cont)
print(paste("Dispersion parameter (Negative Binomial):", dispersion_nb))
## [1] "Dispersion parameter (Negative Binomial): 0.569725366742915"

plot the residuals for the negative binomial model

# Extract residuals
residuals_nb_cont <- residuals(nb_model_cont, type = "deviance")

# Plot residuals
ggplot(data = df, aes(x = seq_along(residuals_nb_cont), y = residuals_nb_cont)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  labs(title = "Residuals (Negative Binomial - Day as Continuous)",
       x = "Index",
       y = "Deviance Residuals") +
  theme_minimal()

  1. RANDOM INTERCEPT MODEL A random intercept model allows the intercept to vary across groups or subjects. This means each group or subject has its own baseline level of the response variable, but the effect of the predictors (slopes) is assumed to be the same across all groups or subjects.

Fixed Effects: Day: The effect of Day on cough_Day is consistent across all sites and individuals. Random Effects: (1 | site): The intercept can vary across sites. Each site has its own baseline level of cough_Day. (1 | ID): The intercept can vary across individuals. Each individual has their own baseline level of cough_Day.

This model allows for variability in both the baseline count of coughs (intercept) and the effect of Day (slope) across sites, but only variability in the intercept across individuals. Suitable if you believe both the baseline cough count and the rate of change (slope) with Day vary by site.

#lets convert Day to factor
df$Day <- as.factor(df$Day)

# Fit the Random-Intercept Poisson mixed-effects model
random_intercept_model <- glmer(cough_Day ~ Day + (1 | site) + (1 | ID), family = poisson(link = "log"), data = df)
## boundary (singular) fit: see help('isSingular')
# Summary of the model
summary(random_intercept_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: cough_Day ~ Day + (1 | site) + (1 | ID)
##    Data: df
## 
##      AIC      BIC   logLik deviance df.resid 
##  94487.6  94571.5 -47227.8  94455.6     1384 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -26.145  -4.197  -1.302   2.886  57.346 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  ID     (Intercept) 2.852    1.689   
##  site   (Intercept) 0.000    0.000   
## Number of obs: 1400, groups:  ID, 100; site, 2
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  4.065127   0.169220  24.023  < 2e-16 ***
## Day2         0.493119   0.010406  47.387  < 2e-16 ***
## Day3         0.485492   0.010421  46.586  < 2e-16 ***
## Day4         0.310568   0.010794  28.772  < 2e-16 ***
## Day5         0.184326   0.011097  16.610  < 2e-16 ***
## Day6         0.044712   0.011468   3.899 9.67e-05 ***
## Day7         0.002954   0.011587   0.255   0.7987    
## Day8         0.158780   0.011162  14.225  < 2e-16 ***
## Day9         0.025757   0.011522   2.235   0.0254 *  
## Day10        0.070485   0.011397   6.185 6.22e-10 ***
## Day11       -0.007288   0.011617  -0.627   0.5304    
## Day12        0.116774   0.011272  10.360  < 2e-16 ***
## Day13       -0.066230   0.011793  -5.616 1.95e-08 ***
## Day14       -0.340259   0.012717 -26.757  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation matrix not shown by default, as p = 14 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
## optimizer (Nelder_Mead) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

extract the fixed effect coefficients

# Extract the fixed effect coefficients
fixed_effects <- fixef(random_intercept_model)

# Exponentiate the fixed effect coefficients
exp_fixed_effects <- exp(fixed_effects)

# Print the exponentiated coefficients
exp_fixed_effects
## (Intercept)        Day2        Day3        Day4        Day5        Day6 
##  58.2723048   1.6374161   1.6249749   1.3641995   1.2024080   1.0457267 
##        Day7        Day8        Day9       Day10       Day11       Day12 
##   1.0029588   1.1720797   1.0260912   1.0730280   0.9927381   1.1238657 
##       Day13       Day14 
##   0.9359157   0.7115861
# Plot residuals for the glmer model
plot(residuals(random_intercept_model, type = "pearson"), main = "Residuals (randon intercept)")

#add a smooth line to the plot
lines(lowess(residuals(random_intercept_model, type = "pearson")), col = "red")

  1. RANDOM COEFFICIENT MODEL A random coefficient model (also called a random slope model) allows both the intercept and the slope to vary across groups or subjects. This means each group or subject can have its own baseline level of the response variable and its own effect of the predictor.

Fixed Effects:

Day: The overall effect of Day on cough_Day. Random Effects:

(Day | site): Both the intercept and the slope of Day can vary across sites. Each site has its own baseline level of cough_Day and its own effect of Day. (1 | ID): The intercept can vary across individuals. Each individual has their own baseline level of cough_Day.

This model allows for variability in the baseline count of coughs (intercept) across sites and variability in both the baseline count of coughs (intercept) and the effect of Day (slope) across individuals.

# Fit the Random-Coefficient Poisson mixed-effects model
random_coefficient_model <- glmer(cough_Day ~ Day + (Day | site) + (1| ID), family = poisson(link = "log"), data = df)

# Summary of the model
summary(random_coefficient_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: cough_Day ~ Day + (Day | site) + (1 | ID)
##    Data: df
## 
##      AIC      BIC   logLik deviance df.resid 
##  92816.4  93445.7 -46288.2  92576.4     1280 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -27.628  -3.982  -1.276   2.569  53.009 
## 
## Random effects:
##  Groups Name        Variance Std.Dev. Corr                                   
##  ID     (Intercept) 2.80583  1.6751                                          
##  site   (Intercept) 0.03509  0.1873                                          
##         Day2        0.04096  0.2024   0.82                                   
##         Day3        0.02790  0.1670   0.81 1.00                              
##         Day4        0.03307  0.1818   0.39 0.63 0.65                         
##         Day5        0.41042  0.6406   0.45 0.64 0.65 0.83                    
##         Day6        0.21812  0.4670   0.36 0.56 0.57 0.87 0.99               
##         Day7        0.30276  0.5502   0.55 0.78 0.79 0.90 0.78 0.79          
##         Day8        0.49362  0.7026   0.33 0.49 0.50 0.71 0.84 0.84 0.82     
##         Day9        0.42300  0.6504   0.35 0.49 0.50 0.65 0.79 0.78 0.81 0.99
##         Day10       0.65959  0.8122   0.39 0.54 0.55 0.67 0.78 0.77 0.85 0.99
##         Day11       0.99829  0.9991   0.27 0.37 0.38 0.49 0.63 0.62 0.57 0.75
##         Day12       0.65517  0.8094   0.27 0.40 0.41 0.56 0.65 0.65 0.67 0.83
##         Day13       0.93183  0.9653   0.31 0.42 0.42 0.53 0.69 0.67 0.61 0.80
##         Day14       0.92247  0.9605   0.32 0.50 0.51 0.74 0.74 0.76 0.80 0.86
##                          
##                          
##                          
##                          
##                          
##                          
##                          
##                          
##                          
##                          
##                          
##  1.00                    
##  0.76 0.75               
##  0.83 0.83 0.99          
##  0.80 0.79 1.00 0.99     
##  0.84 0.85 0.93 0.97 0.94
## Number of obs: 1400, groups:  ID, 100; site, 2
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  4.072425   0.213842  19.044  < 2e-16 ***
## Day2         0.471765   0.143499   3.288  0.00101 ** 
## Day3         0.469919   0.118577   3.963  7.4e-05 ***
## Day4         0.318058   0.129041   2.465  0.01371 *  
## Day5         0.145743   0.453146   0.322  0.74774    
## Day6         0.036622   0.330448   0.111  0.91175    
## Day7        -0.008039   0.389252  -0.021  0.98352    
## Day8         0.109320   0.496927   0.220  0.82588    
## Day9        -0.034802   0.460038  -0.076  0.93970    
## Day10       -0.013420   0.574401  -0.023  0.98136    
## Day11       -0.089970   0.706604  -0.127  0.89868    
## Day12        0.069217   0.572467   0.121  0.90376    
## Day13       -0.159286   0.682688  -0.233  0.81551    
## Day14       -0.356361   0.679267  -0.525  0.59984    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## optimizer (Nelder_Mead) convergence code: 4 (failure to converge in 10000 evaluations)
## unable to evaluate scaled gradient
## Model failed to converge: degenerate  Hessian with 24 negative eigenvalues
## failure to converge in 10000 evaluations
# Extract and exponentiate the fixed effect coefficients
fixed_effects <- fixef(random_coefficient_model)
exp_fixed_effects <- exp(fixed_effects)

# Print the exponentiated coefficients
exp_fixed_effects
## (Intercept)        Day2        Day3        Day4        Day5        Day6 
##  58.6991195   1.6028207   1.5998648   1.3744554   1.1568992   1.0373009 
##        Day7        Day8        Day9       Day10       Day11       Day12 
##   0.9919928   1.1155190   0.9657963   0.9866700   0.9139586   1.0716686 
##       Day13       Day14 
##   0.8527528   0.7002199
# Plot residuals for the glmer model
plot(residuals(random_coefficient_model, type = "pearson"), main = "Residuals random coeff")

#add a smooth line to the plot
lines(lowess(residuals(random_coefficient_model, type = "pearson")), col = "red")

  1. RANDOM COEFFICIENT MODEL BY ID AND TIME Fixed Effects:

Day: The overall effect of Day on cough_Day. Random Effects:

(1 | site): The intercept can vary across sites. Each site has its own baseline level of cough_Day. (Day | ID): Both the intercept and the slope of Day can vary across individuals. Each individual has their own baseline level of cough_Day and their own effect of Day.

This model allows for variability in the baseline count of coughs (intercept) across sites and variability in both the baseline count of coughs (intercept) and the effect of Day (slope) across individuals. Suitable if you believe the baseline cough count varies by site and the baseline cough count and rate of change with Day vary by individual.

#load library
library(glmmTMB)

# Fit the Random-Coefficients Poisson mixed-effects model using glmmTMB
poisson_random_coefficients_model_tmb_ID <- glmmTMB(cough_Day ~ Day + (1 | site) + (Day | ID), 
                                                    family = poisson, 
                                                    data = df)

# Generate Pearson residuals
residuals_tmb_ID <- residuals(poisson_random_coefficients_model_tmb_ID, type = "pearson")

# Create residual plot
plot(residuals_tmb_ID, main = "Residuals (glmmTMB Random Coefficients Model)", 
     ylab = "Pearson Residuals", xlab = "Index", pch = 16, cex = 0.6)
abline(h = 0, col = "red")

#compare the models
anova(random_intercept_model, random_coefficient_model, test = "Chisq")
## Data: df
## Models:
## random_intercept_model: cough_Day ~ Day + (1 | site) + (1 | ID)
## random_coefficient_model: cough_Day ~ Day + (Day | site) + (1 | ID)
##                          npar   AIC   BIC logLik deviance  Chisq  Df Pr(>Chisq)
## random_intercept_model     16 94488 94571 -47228    94456                      
## random_coefficient_model  120 92816 93446 -46288    92576 1879.2 104  < 2.2e-16
##                             
## random_intercept_model      
## random_coefficient_model ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since the number of coughs varies between individuals, Day, and site, the Poisson Random Coefficients Model using glmmTMB appears to be the most appropriate choice. This model will account for variability in the baseline count of coughs across sites and both the baseline count and the effect of Day across individuals, providing a more flexible and accurate representation of our data.

#compare AIC and BIC
AIC(random_intercept_model, random_coefficient_model, poisson_random_coefficients_model_tmb_ID)
##                                           df      AIC
## random_intercept_model                    16 94487.56
## random_coefficient_model                 120 92816.37
## poisson_random_coefficients_model_tmb_ID 120 14019.78
BIC(random_intercept_model, random_coefficient_model, poisson_random_coefficients_model_tmb_ID)
##                                           df      BIC
## random_intercept_model                    16 94571.47
## random_coefficient_model                 120 93445.68
## poisson_random_coefficients_model_tmb_ID 120 14649.09

Compare GEE and GLMM

GEE Models fixed effects like Day but assumes the effect is consistent across all observations GLMM Models fixed effects like Day but allows the effect to vary across groups or subjects (random effects)

# Load necessary libraries
library(broom.mixed)

# Fit the glmmTMB model
glmmTMB_model <- glmmTMB(
  cough_Day ~ Day  + 
    (1 | site) + (Day | ID), 
  family = poisson, 
  data = df,
  control = glmmTMBControl(optCtrl = list(iter.max = 10000, eval.max = 10000))
)

# Create a combined clustering variable
df <- df %>%
  mutate(cluster = interaction(site, ID, drop = TRUE))

# Fit the GEE model with the combined clustering variable
gee_model <- geeglm(
  cough_Day ~ Day , 
  family = poisson, 
  data = df, 
  id = cluster,  # Combined clustering variable
  corstr = "exchangeable"  # Correlation structure
)
# Extract and print fixed effects for glmmTMB
glmmTMB_fixed_effects <- broom.mixed::tidy(glmmTMB_model, effects = "fixed") %>%
  mutate(p.value = round(p.value, 3))
print(glmmTMB_fixed_effects)
## # A tibble: 14 × 7
##    effect component term        estimate std.error statistic p.value
##    <chr>  <chr>     <chr>          <dbl>     <dbl>     <dbl>   <dbl>
##  1 fixed  cond      (Intercept)   4.45       0.148    30.1     0    
##  2 fixed  cond      Day2          0.217      0.114     1.90    0.057
##  3 fixed  cond      Day3         -0.0165     0.142    -0.117   0.907
##  4 fixed  cond      Day4         -0.552      0.203    -2.72    0.007
##  5 fixed  cond      Day5         -1.08       0.231    -4.68    0    
##  6 fixed  cond      Day6         -1.52       0.301    -5.05    0    
##  7 fixed  cond      Day7         -1.74       0.318    -5.48    0    
##  8 fixed  cond      Day8         -1.66       0.307    -5.43    0    
##  9 fixed  cond      Day9         -2.47       0.418    -5.91    0    
## 10 fixed  cond      Day10        -1.97       0.344    -5.72    0    
## 11 fixed  cond      Day11        -1.58       0.292    -5.41    0    
## 12 fixed  cond      Day12        -1.90       0.355    -5.34    0    
## 13 fixed  cond      Day13        -2.03       0.349    -5.82    0    
## 14 fixed  cond      Day14        -2.29       0.319    -7.17    0
# Extract and print coefficients for GEE model
gee_fixed_effects <- broom::tidy(gee_model) %>%
  mutate(p.value = round(p.value, 3))
print(gee_fixed_effects)
## # A tibble: 14 × 5
##    term        estimate std.error   statistic p.value
##    <chr>          <dbl>     <dbl>       <dbl>   <dbl>
##  1 (Intercept)  5.00       0.0997 2517.         0    
##  2 Day2         0.493      0.0718   47.2        0    
##  3 Day3         0.485      0.0681   50.8        0    
##  4 Day4         0.311      0.0819   14.4        0    
##  5 Day5         0.184      0.139     1.75       0.186
##  6 Day6         0.0447     0.106     0.178      0.673
##  7 Day7         0.00295    0.0950    0.000967   0.975
##  8 Day8         0.159      0.111     2.06       0.151
##  9 Day9         0.0258     0.117     0.0487     0.825
## 10 Day10        0.0705     0.150     0.221      0.638
## 11 Day11       -0.00729    0.107     0.00466    0.946
## 12 Day12        0.117      0.103     1.29       0.256
## 13 Day13       -0.0662     0.107     0.382      0.537
## 14 Day14       -0.340      0.142     5.77       0.016
# Summary for glmmTMB model
summary(glmmTMB_model)
##  Family: poisson  ( log )
## Formula:          cough_Day ~ Day + (1 | site) + (Day | ID)
## Data: df
## 
##      AIC      BIC   logLik deviance df.resid 
##  14019.8  14649.1  -6889.9  13779.8     1280 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev. Corr                                   
##  site   (Intercept)  0.01603 0.1266                                          
##  ID     (Intercept)  1.35150 1.1625                                          
##         Day2         1.22333 1.1060   0.12                                   
##         Day3         1.88846 1.3742   0.25 0.81                              
##         Day4         3.74011 1.9339   0.24 0.43 0.72                         
##         Day5         4.70845 2.1699   0.31 0.39 0.56 0.74                    
##         Day6         7.63363 2.7629   0.28 0.34 0.46 0.63 0.86               
##         Day7         8.38341 2.8954   0.40 0.35 0.49 0.54 0.76 0.87          
##         Day8         7.77778 2.7889   0.48 0.39 0.48 0.46 0.67 0.74 0.94     
##         Day9        14.29932 3.7814   0.45 0.43 0.51 0.53 0.71 0.80 0.94 0.95
##         Day10        9.68098 3.1114   0.43 0.42 0.48 0.48 0.58 0.69 0.85 0.88
##         Day11        7.09321 2.6633   0.35 0.32 0.38 0.43 0.55 0.71 0.85 0.87
##         Day12       10.48209 3.2376   0.40 0.42 0.46 0.43 0.59 0.73 0.85 0.87
##         Day13        9.78080 3.1274   0.36 0.46 0.46 0.42 0.63 0.73 0.83 0.85
##         Day14        7.98886 2.8265   0.45 0.44 0.51 0.39 0.52 0.65 0.82 0.85
##                           
##                           
##                           
##                           
##                           
##                           
##                           
##                           
##                           
##                           
##                           
##  0.95                     
##  0.89 0.94                
##  0.92 0.97 0.96           
##  0.91 0.92 0.91 0.95      
##  0.89 0.92 0.89 0.93 0.92 
## Number of obs: 1400, groups:  site, 2; ID, 100
## 
## Conditional model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  4.44848    0.14756  30.147  < 2e-16 ***
## Day2         0.21730    0.11433   1.901  0.05735 .  
## Day3        -0.01655    0.14198  -0.117  0.90722    
## Day4        -0.55179    0.20298  -2.718  0.00656 ** 
## Day5        -1.08119    0.23105  -4.679 2.88e-06 ***
## Day6        -1.51704    0.30065  -5.046 4.51e-07 ***
## Day7        -1.74198    0.31810  -5.476 4.35e-08 ***
## Day8        -1.66457    0.30664  -5.428 5.68e-08 ***
## Day9        -2.46822    0.41794  -5.906 3.51e-09 ***
## Day10       -1.96627    0.34392  -5.717 1.08e-08 ***
## Day11       -1.58190    0.29228  -5.412 6.22e-08 ***
## Day12       -1.89681    0.35549  -5.336 9.52e-08 ***
## Day13       -2.02765    0.34867  -5.815 6.05e-09 ***
## Day14       -2.28655    0.31891  -7.170 7.50e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Summary for GEE model
summary(gee_model)
## 
## Call:
## geeglm(formula = cough_Day ~ Day, family = poisson, data = df, 
##     id = cluster, corstr = "exchangeable")
## 
##  Coefficients:
##              Estimate   Std.err     Wald Pr(>|W|)    
## (Intercept)  5.001998  0.099697 2517.253  < 2e-16 ***
## Day2         0.493119  0.071781   47.194 6.43e-12 ***
## Day3         0.485492  0.068125   50.787 1.03e-12 ***
## Day4         0.310567  0.081854   14.396 0.000148 ***
## Day5         0.184326  0.139234    1.753 0.185552    
## Day6         0.044712  0.105929    0.178 0.672957    
## Day7         0.002954  0.095011    0.001 0.975193    
## Day8         0.158780  0.110667    2.058 0.151359    
## Day9         0.025756  0.116665    0.049 0.825270    
## Day10        0.070485  0.149976    0.221 0.638375    
## Day11       -0.007289  0.106799    0.005 0.945587    
## Day12        0.116774  0.102852    1.289 0.256226    
## Day13       -0.066230  0.107227    0.382 0.536799    
## Day14       -0.340259  0.141625    5.772 0.016282 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = exchangeable 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)    345.6    74.1
##   Link = identity 
## 
## Estimated Correlation Parameters:
##       Estimate Std.err
## alpha   0.6792 0.09026
## Number of clusters:   100  Maximum cluster size: 14
# AIC and BIC for glmmTMB model
aic_glmmTMB <- AIC(glmmTMB_model)
bic_glmmTMB <- BIC(glmmTMB_model)
cat("AIC for glmmTMB model:", aic_glmmTMB, "\n")
## AIC for glmmTMB model: 14020
cat("BIC for glmmTMB model:", bic_glmmTMB, "\n")
## BIC for glmmTMB model: 14649

glmmTB residual plot

# Plot residuals for glmmTMB model
glmmTMB_residuals <- residuals(glmmTMB_model, type = "pearson")
ggplot(data = df, aes(x = seq_along(glmmTMB_residuals), y = glmmTMB_residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  labs(title = "Residuals (glmmTMB Model)", x = "Index", y = "Pearson Residuals") +
  theme_minimal()

GEE residual plot

# Plot residuals for GEE model
gee_residuals <- residuals(gee_model, type = "pearson")
ggplot(data = df, aes(x = seq_along(gee_residuals), y = gee_residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  labs(title = "Residuals (GEE Model)", x = "Index", y = "Pearson Residuals") +
  theme_minimal()

categorical versus continuous day

  1. Continuous Day Model Linear Relationship: If there is a linear or approximately linear relationship between the variable and the outcome, it is appropriate to treat the variable as continuous. Example: If Day represents time and you expect the number of coughs to increase or decrease linearly over time.

  2. Categorical Day Model Non-linear Relationships: If the relationship between the variable and the outcome is non-linear or you expect different effects at different levels, it is appropriate to treat the variable as categorical. Example: If the effect of Day on the number of coughs varies significantly at different times (e.g., different trends in different periods)

df$Day <- as.numeric(df$Day)

# Fit the GLMM with Poisson distribution and log link, Day as continuous
poisson_random_coefficients_model_cont <- glmmTMB(cough_Day ~ Day +(1 | site) + (Day | ID), 
                                                  family = poisson, 
                                                  data = df)

summary_cont <- summary(poisson_random_coefficients_model_cont)
print(summary_cont)
##  Family: poisson  ( log )
## Formula:          cough_Day ~ Day + (1 | site) + (Day | ID)
## Data: df
## 
##      AIC      BIC   logLik deviance df.resid 
##    71215    71247   -35602    71203     1394 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev. Corr 
##  site   (Intercept) 0.0316   0.178         
##  ID     (Intercept) 1.3004   1.140         
##         Day         0.0827   0.288    0.25 
## Number of obs: 1400, groups:  site, 2; ID, 100
## 
## Conditional model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   5.0623     0.1703   29.73  < 2e-16 ***
## Day          -0.1960     0.0294   -6.66  2.8e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#categorize BMI into underweight, normal, overweight, and obese
df$BMI_category <- cut(df$BMI, breaks = c(0, 18.5, 25, 30, Inf), labels = c("Underweight", "Normal", "Overweight", "Obese"))

#categorise Ageyrs into children, young adults, adults, and elderly
df$Age_category <- cut(df$Ageyrs, breaks = c(18, 40, 60, Inf), labels = c( "Young Adults", "Adults", "Elderly"))

table(df$Age_category)
## 
## Young Adults       Adults      Elderly 
##         1078          294           28

Bivariate analysis

Include all predictors for stepwise regression by backward elimination

df$Day <- as.numeric(df$Day)

# Fit the GLMM with Poisson distribution and log link, Day as continuous
poisson_random_coefficients_model_cont <- glmmTMB(cough_Day ~ Day + Ageyrs+ diabeticstatus + HIVStatus+Semiqtltyrslt + BCGscar + sex+Smoking+BMI+ (1 | site) + (Day | ID), 
                                                  family = poisson, 
                                                  data = df)

summary_cont <- summary(poisson_random_coefficients_model_cont)
print(summary_cont)
##  Family: poisson  ( log )
## Formula:          
## cough_Day ~ Day + Ageyrs + diabeticstatus + HIVStatus + Semiqtltyrslt +  
##     BCGscar + sex + Smoking + BMI + (1 | site) + (Day | ID)
## Data: df
## 
##      AIC      BIC   logLik deviance df.resid 
##    71215    71305   -35591    71181     1383 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev. Corr 
##  site   (Intercept) 0.0668   0.259         
##  ID     (Intercept) 1.0516   1.025         
##         Day         0.0804   0.284    0.28 
## Number of obs: 1400, groups:  site, 2; ID, 100
## 
## Conditional model:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             2.8573     0.7416    3.85  0.00012 ***
## Day                    -0.1947     0.0290   -6.71  1.9e-11 ***
## Ageyrs                 -0.0011     0.0105   -0.11  0.91592    
## diabeticstatusYes       0.9957     0.7884    1.26  0.20661    
## HIVStatusPositive      -0.3612     0.2821   -1.28  0.20046    
## SemiqtltyrsltLow       -0.1080     0.2920   -0.37  0.71143    
## SemiqtltyrsltMedium     0.5013     0.2618    1.91  0.05555 .  
## SemiqtltyrsltTrace     -0.2231     0.4347   -0.51  0.60770    
## SemiqtltyrsltVery Low  -0.1744     0.3724   -0.47  0.63948    
## BCGscarYes              0.3003     0.2572    1.17  0.24292    
## sexMale                 0.7383     0.2281    3.24  0.00121 ** 
## SmokingYes              0.3578     0.3589    1.00  0.31879    
## BMI                     0.0754     0.0296    2.55  0.01089 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Round the coqefficient estimates to 2 decimal places and p-values to 3 decimal places

# Load necessary libraries
library(broom.mixed)
library(dplyr)

# Fit the initial full model
full_model <- glmmTMB(
  cough_Day ~ Day + Semiqtltyrslt + diabeticstatus +
    (1 | site) + (Day | ID), 
  family = poisson, 
  data = df,
  control = glmmTMBControl(optCtrl = list(iter.max = 10000, eval.max = 10000))
)

# Extract the fixed effects and p-values
tidy_full_model <- tidy(full_model, effects = "fixed") %>%
  mutate(p.value = round(p.value, 3))

# Compute confidence intervals
conf_int <- confint(full_model, parm = "beta_", level = 0.95)
conf_int <- as.data.frame(conf_int)
conf_int <- conf_int %>%
  rownames_to_column(var = "term") %>%
  rename(conf.low = `2.5 %`, conf.high = `97.5 %`)

# Join the confidence intervals with the model output
tidy_full_model <- left_join(tidy_full_model, conf_int, by = "term") %>%
  mutate(
    estimate = round(estimate, 2),
    std.error = round(std.error, 2),
    conf.low = round(conf.low, 2),
    conf.high = round(conf.high, 2),
    exp.estimate = round(exp(estimate), 2),
    exp.conf.low = round(exp(conf.low), 2),
    exp.conf.high = round(exp(conf.high), 2)
  )

# Select and rename columns for clarity
tidy_full_model <- tidy_full_model %>%
  dplyr::select(term, estimate, std.error, p.value, conf.low, conf.high, exp.estimate, exp.conf.low, exp.conf.high) %>%
  dplyr::rename(
    "Term" = term,
    "Estimate" = estimate,
    "Std. Error" = std.error,
    "P-value" = p.value,
    "CI Lower" = conf.low,
    "CI Upper" = conf.high,
    "Exp(Estimate)" = exp.estimate,
    "Exp(CI Lower)" = exp.conf.low,
    "Exp(CI Upper)" = exp.conf.high
  )

# Print the model coefficients, p-values, and exponentiated estimates with confidence intervals
print(tidy_full_model)
## # A tibble: 7 × 9
##   Term     Estimate `Std. Error` `P-value` `CI Lower` `CI Upper` `Exp(Estimate)`
##   <chr>       <dbl>        <dbl>     <dbl>      <dbl>      <dbl>           <dbl>
## 1 (Interc…     5.02         0.24     0           4.56       5.48          151.  
## 2 Day         -0.2          0.03     0          -0.25      -0.14            0.82
## 3 Semiqtl…    -0.18         0.31     0.559      -0.78       0.42            0.84
## 4 Semiqtl…     0.34         0.28     0.223      -0.2        0.88            1.4 
## 5 Semiqtl…    -0.37         0.45     0.416      -1.25       0.52            0.69
## 6 Semiqtl…    -0.1          0.39     0.805      -0.87       0.67            0.9 
## 7 diabeti…     1.28         0.81     0.112      -0.3        2.86            3.6 
## # ℹ 2 more variables: `Exp(CI Lower)` <dbl>, `Exp(CI Upper)` <dbl>
# Fit the initial full model
full_model <- glmmTMB(
  cough_Day ~ Day + Semiqtltyrslt + diabeticstatus +
    (1 | site) + (Day | ID), 
  family = poisson, 
  data = df,
  control = glmmTMBControl(optCtrl = list(iter.max = 10000, eval.max = 10000))
)

# Extract the fixed effects and p-values
tidy_full_model <- tidy(full_model, effects = "fixed") %>%
  mutate(p.value = round(p.value, 3))

# Compute confidence intervals
conf_int <- confint(full_model, parm = "beta_", level = 0.95)
conf_int <- as.data.frame(conf_int)
conf_int <- conf_int %>%
  rownames_to_column(var = "term") %>%
  rename(conf.low = `2.5 %`, conf.high = `97.5 %`)

# Join the confidence intervals with the model output
tidy_full_model <- left_join(tidy_full_model, conf_int, by = "term") %>%
  mutate(
    exp.estimate = round(exp(estimate), 2),
    exp.conf.low = round(exp(conf.low), 2),
    exp.conf.high = round(exp(conf.high), 2),
    estimate = round(estimate, 2),
    std.error = round(std.error, 2),
    conf.low = round(conf.low, 2),
    conf.high = round(conf.high, 2)
  )

# Select and rename columns for clarity
tidy_full_model <- tidy_full_model %>%
  dplyr::select(term, estimate, std.error, p.value, conf.low, conf.high, exp.estimate, exp.conf.low, exp.conf.high) %>%
  dplyr::rename(
    "Term" = term,
    "Estimate" = estimate,
    "Std. Error" = std.error,
    "P-value" = p.value,
    "CI Lower" = conf.low,
    "CI Upper" = conf.high,
    "Exp(Estimate)" = exp.estimate,
    "Exp(CI Lower)" = exp.conf.low,
    "Exp(CI Upper)" = exp.conf.high
  )

# Print the model coefficients, p-values, and exponentiated estimates with confidence intervals
print(tidy_full_model)
## # A tibble: 7 × 9
##   Term     Estimate `Std. Error` `P-value` `CI Lower` `CI Upper` `Exp(Estimate)`
##   <chr>       <dbl>        <dbl>     <dbl>      <dbl>      <dbl>           <dbl>
## 1 (Interc…     5.02         0.24     0           4.56       5.48          152.  
## 2 Day         -0.2          0.03     0          -0.25      -0.14            0.82
## 3 Semiqtl…    -0.18         0.31     0.559      -0.78       0.42            0.84
## 4 Semiqtl…     0.34         0.28     0.223      -0.2        0.88            1.4 
## 5 Semiqtl…    -0.37         0.45     0.416      -1.25       0.52            0.69
## 6 Semiqtl…    -0.1          0.39     0.805      -0.87       0.67            0.91
## 7 diabeti…     1.28         0.81     0.112      -0.3        2.86            3.6 
## # ℹ 2 more variables: `Exp(CI Lower)` <dbl>, `Exp(CI Upper)` <dbl>

Time X Site interaction with no covariates

# Fit the GLMM with Poisson distribution and log link, Day as continuous
poisson_random_coefficients_model_cont <- glmmTMB(cough_Day ~ Day + site + (1 | site) + (Day | ID), 
                                                  family = poisson, 
                                                  data = df)

summary_cont <- summary(poisson_random_coefficients_model_cont)
print(summary_cont)
##  Family: poisson  ( log )
## Formula:          cough_Day ~ Day + site + (1 | site) + (Day | ID)
## Data: df
## 
##      AIC      BIC   logLik deviance df.resid 
##    71213    71250   -35600    71199     1393 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev. Corr 
##  site   (Intercept) 1.00e-07 0.000316      
##  ID     (Intercept) 1.28e+00 1.129272      
##         Day         8.35e-02 0.289020 0.26 
## Number of obs: 1400, groups:  site, 2; ID, 100
## 
## Conditional model:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   5.2999     0.1575    33.7  < 2e-16 ***
## Day          -0.1965     0.0296    -6.6  3.1e-11 ***
## siteMulago   -0.4773     0.2209    -2.2    0.031 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Time X Site interaction with covariates

# Fit the GLMM with Poisson distribution and log link, Day as continuous
poisson_random_coefficients_model_cont <- glmmTMB(cough_Day ~ Day + Day*site + sex*Day + diabeticstatus*Day + BMI*Day + (1 | site) + (Day | ID), 
                                                  family = poisson, 
                                                  data = df)

summary_cont <- summary(poisson_random_coefficients_model_cont)
print(summary_cont)
##  Family: poisson  ( log )
## Formula:          
## cough_Day ~ Day + Day * site + sex * Day + diabeticstatus * Day +  
##     BMI * Day + (1 | site) + (Day | ID)
## Data: df
## 
##      AIC      BIC   logLik deviance df.resid 
##    71211    71284   -35591    71183     1386 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev. Corr 
##  site   (Intercept) 5.87e-08 0.000242      
##  ID     (Intercept) 1.18e+00 1.084355      
##         Day         7.41e-02 0.272180 0.31 
## Number of obs: 1400, groups:  site, 2; ID, 100
## 
## Conditional model:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            3.74615    0.69776    5.37  7.9e-08 ***
## Day                   -0.15397    0.17734   -0.87    0.385    
## siteMulago            -0.36753    0.22177   -1.66    0.097 .  
## sexMale                0.66506    0.23669    2.81    0.005 ** 
## diabeticstatusYes      0.85576    0.79056    1.08    0.279    
## BMI                    0.05348    0.03019    1.77    0.076 .  
## Day:siteMulago         0.03265    0.05627    0.58    0.562    
## Day:sexMale            0.02387    0.06008    0.40    0.691    
## Day:diabeticstatusYes -0.38670    0.19910   -1.94    0.052 .  
## Day:BMI               -0.00293    0.00763   -0.38    0.701    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Remove site interaction

# Fit the GLMM with Poisson distribution and log link, Day as continuous
poisson_random_coefficients_model_cont <- glmmTMB(cough_Day ~ Day +  sex*Day + diabeticstatus*Day + BMI*Day + (1 | site) + (Day | ID), 
                                                  family = poisson, 
                                                  data = df)

summary_cont <- summary(poisson_random_coefficients_model_cont)
print(summary_cont)
##  Family: poisson  ( log )
## Formula:          
## cough_Day ~ Day + sex * Day + diabeticstatus * Day + BMI * Day +  
##     (1 | site) + (Day | ID)
## Data: df
## 
##      AIC      BIC   logLik deviance df.resid 
##    71210    71273   -35593    71186     1388 
## 
## Random effects:
## 
## Conditional model:
##  Groups Name        Variance Std.Dev. Corr 
##  site   (Intercept) 0.0167   0.129         
##  ID     (Intercept) 1.2026   1.097         
##         Day         0.0729   0.270    0.30 
## Number of obs: 1400, groups:  site, 2; ID, 100
## 
## Conditional model:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            3.58205    0.70749    5.06  4.1e-07 ***
## Day                   -0.13779    0.17442   -0.79   0.4295    
## sexMale                0.67903    0.23960    2.83   0.0046 ** 
## diabeticstatusYes      0.96181    0.80413    1.20   0.2317    
## BMI                    0.05207    0.03055    1.70   0.0883 .  
## Day:sexMale            0.02091    0.05947    0.35   0.7251    
## Day:diabeticstatusYes -0.40451    0.19521   -2.07   0.0383 *  
## Day:BMI               -0.00280    0.00756   -0.37   0.7115    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1