library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(openxlsx)
library(dplyr)

# Load dataset
Probiotic_df <- read.csv("probiotic_data.csv")

# Remove patients who did not complete the study
Probiotic_df_clean <- Probiotic_df %>% na.omit()

# Count total participants
total_recruited <- nrow(Probiotic_df)
print("Total patients recruited:")
## [1] "Total patients recruited:"
print(total_recruited)
## [1] 55
# Count by group
group_counts <- table(Probiotic_df$Group)
print("Group counts:")
## [1] "Group counts:"
print(group_counts)
## 
##    Placebo Probiotics 
##         28         27
# Count completed participants (those with WGTT_POST data)
completed <- Probiotic_df[!is.na(Probiotic_df$WGTT_POST), ]
completed_counts <- table(completed$Group)
print("Completed the study:")
## [1] "Completed the study:"
print(completed_counts)
## 
##    Placebo Probiotics 
##         26         22
# Count the total participant that completed the experiment 
print("Total completed:")
## [1] "Total completed:"
print(sum(completed_counts))
## [1] 48
# Split groups into placebo group and probiotic (intervention) group
placebo_df <- Probiotic_df_clean %>% filter(Group == "Placebo")
probiotic_df <- Probiotic_df_clean %>% filter(Group == "Probiotics")

# Compute summary statistics function (disease duration, % of males, % chinese vs malay, % sedentary, % on levodopa, % on dopamine, 
#.   % on low fiber diet, and % with disease stage <=3)
compute_summary <- function(df, group_name) {
  tibble(
    Group = group_name,
    Duration = paste0(round(mean(df$Duration, na.rm = TRUE), 2), " ± ", round(sd(df$Duration, na.rm = TRUE), 2)),
    Male = paste0(round(mean(df$Sex == "Male", na.rm = TRUE) * 100, 2), "%"),
    Chinese = paste0(round(mean(df$Race == "Chinese", na.rm = TRUE) * 100, 2), "%"),
    Malay = paste0(round(mean(df$Race == "Malay", na.rm = TRUE) * 100, 2), "%"),
    Indian = paste0(round(mean(df$Race == "Indian", na.rm = TRUE) * 100, 2), "%"),
    Sedentary = paste0(round(mean(df$Sedentary == "Sedentary (physical exercise less than 4 hour per week)", na.rm = TRUE) * 100, 2), "%"),
    Levodopa = paste0(round(mean(df$Levodopa == "yes", na.rm = TRUE) * 100, 2), "%"),
    Dopamine = paste0(round(mean(df$Dopamine == "yes", na.rm = TRUE) * 100, 2), "%"),
    LowFiber = paste0(round(mean(df$Fibre == "low (less than 3 serving perday)", na.rm = TRUE) * 100, 2), "%"),
    PD_stage = paste0(round(mean(df$PD_stage_HY == "<=3", na.rm = TRUE) * 100, 2), "%")
  )
}

# Compute summaries
placebo_summary <- compute_summary(placebo_df, "Placebo")
probiotic_summary <- compute_summary(probiotic_df, "Probiotics")

# Create final table of values and percentages for each demographic/clinical characteristic
final_table <- tibble(
  Characteristic = c("Duration", "Male", "Chinese", "Malay","Indian", "Sedentary", "Levodopa", "Dopamine", "Low Fiber", "PD Stage <=3"),
  Placebo = unlist(placebo_summary[2:11]),
  Probiotics = unlist(probiotic_summary[2:11])
)

# Print the final table
print(final_table)
## # A tibble: 10 × 3
##    Characteristic Placebo     Probiotics
##    <chr>          <chr>       <chr>     
##  1 Duration       8.35 ± 6.38 7.5 ± 4.88
##  2 Male           61.54%      59.09%    
##  3 Chinese        53.85%      36.36%    
##  4 Malay          42.31%      63.64%    
##  5 Indian         3.85%       0%        
##  6 Sedentary      50%         50%       
##  7 Levodopa       88.46%      90.91%    
##  8 Dopamine       57.69%      68.18%    
##  9 Low Fiber      73.08%      86.36%    
## 10 PD Stage <=3   65.38%      63.64%
## Visual assessment of normality using Q-Q plots
# Following the approach from page 5 of your PDF

# Q-Q plot for probiotic group
qqplot_prob <- ggplot(data.frame(sample = probiotic_df$WGTT_DIFF), aes(sample = sample)) +
  stat_qq_line(size = 2, aes(color = 'red')) +
  stat_qq(size = 2) +
  theme_light() +
  ggtitle("Q-Q Plot for Probiotic Group GTT Changes")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Q-Q plot for placebo group
qqplot_plac <- ggplot(data.frame(sample = placebo_df$WGTT_DIFF), aes(sample = sample)) +
  stat_qq_line(size = 2, aes(color = 'red')) +
  stat_qq(size = 2) +
  theme_light() +
  ggtitle("Q-Q Plot for Placebo Group GTT Changes")

# Print the Q-Q plots
print(qqplot_prob)

print(qqplot_plac)

# Histograms to assess normality (as shown in page 4 of your PDF)
hist_prob <- ggplot(data.frame(x = probiotic_df$WGTT_DIFF), aes(x = x)) +
  geom_histogram(binwidth = 20, fill = "deepskyblue", color = "black") +
  theme_light() +
  ggtitle("Histogram of GTT Changes - Probiotic Group")

hist_plac <- ggplot(data.frame(x = placebo_df$WGTT_DIFF), aes(x = x)) +
  geom_histogram(binwidth = 20, fill = "deepskyblue", color = "black") +
  theme_light() +
  ggtitle("Histogram of GTT Changes - Placebo Group")

# Print the histograms
print(hist_prob)

print(hist_plac)

# Function to compute p-values
#First need to assess normality for "Duration" (the only continuous variable) using Shapiro-Wilk test
shapiro_test_duration_placebo <- shapiro.test(placebo_df$Duration)
print(shapiro_test_duration_placebo)
## 
##  Shapiro-Wilk normality test
## 
## data:  placebo_df$Duration
## W = 0.86795, p-value = 0.003237
shapiro_test_duration_probiotic <- shapiro.test(probiotic_df$Duration)
print(shapiro_test_duration_probiotic)
## 
##  Shapiro-Wilk normality test
## 
## data:  probiotic_df$Duration
## W = 0.85502, p-value = 0.004169
#Compute the p-values (use Mann-Whitney U test for continuous variable 'Duration', and use chi-square tests for categorical variables)
compute_p_values <- function(var) {
  if (var == "Race") {
    # Use Fisher's exact test for Race. We use fisher.test() with simulate.p.value=True to allow the Fisher's exact test to work with a table of 3x2
    #The simulate.p.value = TRUE parameter tells R to use Monte Carlo simulation to calculate p-value considering we have a larger table
    contingency_table <- table(Probiotic_df_clean[[var]], Probiotic_df_clean$Group)
    fisher.test(contingency_table, simulate.p.value = TRUE)$p.value
  } else if (is.numeric(Probiotic_df_clean[[var]])) {
    # Use Mann-Whitney U test for continuous variables
    wilcox.test(Probiotic_df_clean[[var]] ~ Probiotic_df_clean$Group)$p.value
  } else {
    # Use chi-square test for other categorical variables
    contingency_table <- table(Probiotic_df_clean[[var]], Probiotic_df_clean$Group)
    chisq.test(contingency_table)$p.value
  }
}

# Compute p-values for each characteristic to determine if baseline differences between placebo and intervention group are significant
p_values <- tibble(
  Characteristic = c("Duration", "Sex", "Race", "Sedentary", "Levodopa", "Dopamine", "Fibre", "PD_stage_HY"),
  P_Value = sapply(c("Duration", "Sex", "Race", "Sedentary", "Levodopa", "Dopamine", "Fibre", "PD_stage_HY"), compute_p_values)
)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
## Warning in chisq.test(contingency_table): Chi-squared approximation may be
## incorrect
## Warning in chisq.test(contingency_table): Chi-squared approximation may be
## incorrect
# Print the p-values
print(p_values)
## # A tibble: 8 × 2
##   Characteristic P_Value
##   <chr>            <dbl>
## 1 Duration         0.843
## 2 Sex              1    
## 3 Race             0.233
## 4 Sedentary        1    
## 5 Levodopa         1    
## 6 Dopamine         0.654
## 7 Fibre            0.440
## 8 PD_stage_HY      1.00
# Check normality of gut transit time change using Shapiro-Wilk test
shapiro_test_placebo <- shapiro.test(placebo_df$WGTT_DIFF)
print(shapiro_test_placebo)
## 
##  Shapiro-Wilk normality test
## 
## data:  placebo_df$WGTT_DIFF
## W = 0.78548, p-value = 0.0001007
shapiro_test_probiotic <- shapiro.test(probiotic_df$WGTT_DIFF)
print(shapiro_test_probiotic)
## 
##  Shapiro-Wilk normality test
## 
## data:  probiotic_df$WGTT_DIFF
## W = 0.9059, p-value = 0.03907
# Perform Mann-Whitney U test for difference between groups given data is not normal
mw_test_independent <- wilcox.test(WGTT_DIFF ~ Group, data = Probiotic_df_clean)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
print(mw_test_independent)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  WGTT_DIFF by Group
## W = 186, p-value = 0.03341
## alternative hypothesis: true location shift is not equal to 0
# Perform Wilcoxon Signed-Rank tests within each group given data is not normal
wilcox_paired_placebo <- wilcox.test(placebo_df$WGTT_PRE, placebo_df$WGTT_POST, paired = TRUE)
## Warning in wilcox.test.default(placebo_df$WGTT_PRE, placebo_df$WGTT_POST, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(placebo_df$WGTT_PRE, placebo_df$WGTT_POST, :
## cannot compute exact p-value with zeroes
wilcox_paired_probiotic <- wilcox.test(probiotic_df$WGTT_PRE, probiotic_df$WGTT_POST, paired = TRUE)
## Warning in wilcox.test.default(probiotic_df$WGTT_PRE, probiotic_df$WGTT_POST, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(probiotic_df$WGTT_PRE, probiotic_df$WGTT_POST, :
## cannot compute exact p-value with zeroes
print(wilcox_paired_placebo)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  placebo_df$WGTT_PRE and placebo_df$WGTT_POST
## V = 61, p-value = 0.09118
## alternative hypothesis: true location shift is not equal to 0
print(wilcox_paired_probiotic)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  probiotic_df$WGTT_PRE and probiotic_df$WGTT_POST
## V = 145, p-value = 0.001282
## alternative hypothesis: true location shift is not equal to 0
###########Test
#Calculate descriptive statistics for both groups, before and after
placebo_pre_summary <- data.frame(
  Group = "Placebo",
  Time = "Pre",
  N = length(placebo_df$WGTT_PRE),
  Mean = mean(placebo_df$WGTT_PRE, na.rm = TRUE),
  Median = median(placebo_df$WGTT_PRE, na.rm = TRUE),
  SD = sd(placebo_df$WGTT_PRE, na.rm = TRUE)
)

placebo_post_summary <- data.frame(
  Group = "Placebo",
  Time = "Post",
  N = length(placebo_df$WGTT_POST),
  Mean = mean(placebo_df$WGTT_POST, na.rm = TRUE),
  Median = median(placebo_df$WGTT_POST, na.rm = TRUE),
  SD = sd(placebo_df$WGTT_POST, na.rm = TRUE)
)

probiotic_pre_summary <- data.frame(
  Group = "Probiotic",
  Time = "Pre",
  N = length(probiotic_df$WGTT_PRE),
  Mean = mean(probiotic_df$WGTT_PRE, na.rm = TRUE),
  Median = median(probiotic_df$WGTT_PRE, na.rm = TRUE),
  SD = sd(probiotic_df$WGTT_PRE, na.rm = TRUE)
)

probiotic_post_summary <- data.frame(
  Group = "Probiotic",
  Time = "Post",
  N = length(probiotic_df$WGTT_POST),
  Mean = mean(probiotic_df$WGTT_POST, na.rm = TRUE),
  Median = median(probiotic_df$WGTT_POST, na.rm = TRUE),
  SD = sd(probiotic_df$WGTT_POST, na.rm = TRUE)
)

# Combine all summaries
all_summaries <- rbind(placebo_pre_summary, placebo_post_summary, 
                       probiotic_pre_summary, probiotic_post_summary)

print(all_summaries)
##       Group Time  N      Mean Median       SD
## 1   Placebo  Pre 26 134.26923  168.0 51.19262
## 2   Placebo Post 26 113.53846  146.5 61.53908
## 3 Probiotic  Pre 22 135.36364  168.0 53.74946
## 4 Probiotic Post 22  77.31818   70.5 55.34514
#Plot a box plot of placebo vs intervention group's change in transit time to visualize differences
ggplot(Probiotic_df_clean, aes(x = Group, y = WGTT_DIFF, fill = Group)) +
  geom_boxplot(outlier.shape = NA) + # Avoids outlier clutter
  geom_jitter(width = 0.2, alpha = 0.5) + # Shows individual data points
  theme_minimal() +
  labs(title = "Change in Gut Transit Time", x = "Group", y = "Change in Transit Time (hours)") +
  scale_fill_manual(values = c("Placebo" = "gray", "Probiotics" = "blue"))

# Make sure Group is a factor variable
Probiotic_df_clean$Group <- as.factor(Probiotic_df_clean$Group)


# Step 3: Visualize the data
ggplot(Probiotic_df_clean, aes(x = Group, y = WGTT_DIFF)) +
  geom_boxplot(fill = "lightblue") +
  theme_light() +
  labs(title = "Gut Transit Time Change by Treatment Group",
       y = "Change in Gut Transit Time (hours)",
       x = "Treatment Group")

# Step 4: Build a simple linear regression model (unadjusted)
#Simple descriptive statistics by treatment group
Probiotic_df_clean %>%
  group_by(Group) %>%
  summarise(
    n = n(),mean_WGTT_DIFF = mean(WGTT_DIFF, na.rm = TRUE),
    median_WGTT_DIFF = median(WGTT_DIFF, na.rm = TRUE),
    sd_WGTT_DIFF = sd(WGTT_DIFF, na.rm = TRUE)
  )
## # A tibble: 2 × 5
##   Group          n mean_WGTT_DIFF median_WGTT_DIFF sd_WGTT_DIFF
##   <fct>      <int>          <dbl>            <dbl>        <dbl>
## 1 Placebo       26           20.7                0         60.5
## 2 Probiotics    22           58.0               73         59.3
base_model <- lm(WGTT_DIFF ~ Group, data = Probiotic_df_clean) #means
summary(base_model)
## 
## Call:
## lm(formula = WGTT_DIFF ~ Group, data = Probiotic_df_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -164.73  -27.05  -20.73   38.70  135.27 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        20.73      11.76   1.763   0.0845 .
## GroupProbiotics    37.31      17.36   2.149   0.0369 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 59.94 on 46 degrees of freedom
## Multiple R-squared:  0.09123,    Adjusted R-squared:  0.07148 
## F-statistic: 4.618 on 1 and 46 DF,  p-value: 0.03694
confint(base_model)
##                     2.5 %   97.5 %
## (Intercept)     -2.932161 44.39370
## GroupProbiotics  2.362220 72.26715
par(mfrow = c(2, 2))
plot(base_model)

par(mfrow = c(1, 1))

full_model <- lm(WGTT_DIFF ~ Group + WGTT_PRE + Sedentary, data = Probiotic_df_clean)
summary(full_model)
## 
## Call:
## lm(formula = WGTT_DIFF ~ Group + WGTT_PRE + Sedentary, data = Probiotic_df_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -92.279 -32.794  -0.603  38.661 100.463 
## 
## Coefficients:
##                                                                  Estimate
## (Intercept)                                                      -32.3014
## GroupProbiotics                                                   36.7425
## WGTT_PRE                                                           0.5228
## SedentarySedentary (physical exercise less than 4 hour per week) -34.3399
##                                                                  Std. Error
## (Intercept)                                                         23.1612
## GroupProbiotics                                                     14.8982
## WGTT_PRE                                                             0.1448
## SedentarySedentary (physical exercise less than 4 hour per week)    14.8459
##                                                                  t value
## (Intercept)                                                       -1.395
## GroupProbiotics                                                    2.466
## WGTT_PRE                                                           3.611
## SedentarySedentary (physical exercise less than 4 hour per week)  -2.313
##                                                                  Pr(>|t|)    
## (Intercept)                                                      0.170127    
## GroupProbiotics                                                  0.017622 *  
## WGTT_PRE                                                         0.000776 ***
## SedentarySedentary (physical exercise less than 4 hour per week) 0.025453 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51.43 on 44 degrees of freedom
## Multiple R-squared:  0.3602, Adjusted R-squared:  0.3166 
## F-statistic: 8.257 on 3 and 44 DF,  p-value: 0.0001813
confint(full_model)
##                                                                        2.5 %
## (Intercept)                                                      -78.9797241
## GroupProbiotics                                                    6.7170300
## WGTT_PRE                                                           0.2310736
## SedentarySedentary (physical exercise less than 4 hour per week) -64.2597644
##                                                                      97.5 %
## (Intercept)                                                      14.3768504
## GroupProbiotics                                                  66.7679292
## WGTT_PRE                                                          0.8146187
## SedentarySedentary (physical exercise less than 4 hour per week) -4.4200113