Final Assignment

#Part 1

library(readxl)
library(dplyr)

## 
## Dołączanie pakietu: 'dplyr'

## Następujące obiekty zostały zakryte z 'package:stats':
## 
##     filter, lag

## Następujące obiekty zostały zakryte z 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(car)

## Ładowanie wymaganego pakietu: carData

## 
## Dołączanie pakietu: 'car'

## Następujący obiekt został zakryty z 'package:dplyr':
## 
##     recode

library(rstatix)

## 
## Dołączanie pakietu: 'rstatix'

## Następujący obiekt został zakryty z 'package:stats':
## 
##     filter

library(ggpubr)
library(FSA)

## Registered S3 methods overwritten by 'FSA':
##   method       from
##   confint.boot car 
##   hist.boot    car

## ## FSA v0.10.1. See citation('FSA') if used in publication.
## ## Run fishR() for related website and fishR('IFAR') for related book.

## 
## Dołączanie pakietu: 'FSA'

## Następujący obiekt został zakryty z 'package:car':
## 
##     bootCase

#INSERT WORKING DIRECTORY HERE!!!!!!!!!!!!
diabetes_data <- read.csv("diabetes_controll.csv")
#Step 1 - loading data
str(diabetes_data)

## 'data.frame':    200 obs. of  9 variables:
##  $ patient_id     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ age            : int  63 35 58 60 54 53 51 56 48 65 ...
##  $ sex            : chr  "Female" "Female" "Female" "Male" ...
##  $ intervention   : chr  "Standard_Care" "Standard_Care" "Standard_Care" "Standard_Care" ...
##  $ baseline_hba1c : num  7.2 7.4 9.9 7.5 8.9 6.6 9.6 8.6 6.6 7.1 ...
##  $ followup_hba1c : num  8.1 7.2 8.5 8.3 7.1 7.7 7.5 7.6 9.1 6.8 ...
##  $ fasting_glucose: int  176 145 102 175 166 199 120 154 174 199 ...
##  $ exercise_freq  : chr  "3+_per_week" "None" "3+_per_week" "1-2_per_week" ...
##  $ hba1c_change   : num  -0.9 0.2 1.4 -0.8 1.8 ...

head(diabetes_data)

##   patient_id age    sex  intervention baseline_hba1c followup_hba1c
## 1          1  63 Female Standard_Care            7.2            8.1
## 2          2  35 Female Standard_Care            7.4            7.2
## 3          3  58 Female Standard_Care            9.9            8.5
## 4          4  60   Male Standard_Care            7.5            8.3
## 5          5  54 Female Standard_Care            8.9            7.1
## 6          6  53 Female Standard_Care            6.6            7.7
##   fasting_glucose exercise_freq hba1c_change
## 1             176   3+_per_week         -0.9
## 2             145          None          0.2
## 3             102   3+_per_week          1.4
## 4             175  1-2_per_week         -0.8
## 5             166          None          1.8
## 6             199          None         -1.1

#Step 2 - descriptive stats
baseline_summary <- diabetes_data |>
  group_by(sex,intervention,exercise_freq) |>
  summarise(
    n = n(),
    Mean = mean(baseline_hba1c),
    SD = sd(baseline_hba1c),
    Median = median(baseline_hba1c),
    Q1 = quantile(baseline_hba1c,0.25),
    Q3 = quantile(baseline_hba1c,0.75)
  )

## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by sex, intervention, and exercise_freq.
## ℹ Output is grouped by sex and intervention.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(sex, intervention, exercise_freq))` for per-operation
##   grouping (`?dplyr::dplyr_by`) instead.

print(baseline_summary)

## # A tibble: 24 × 9
## # Groups:   sex, intervention [8]
##    sex    intervention       exercise_freq     n  Mean    SD Median    Q1    Q3
##    <chr>  <chr>              <chr>         <int> <dbl> <dbl>  <dbl> <dbl> <dbl>
##  1 Female Diet_Counseling    1-2_per_week      5  9.54 1.69   10     9.8  10.6 
##  2 Female Diet_Counseling    3+_per_week      10  7.91 0.877   7.95  7.52  8.28
##  3 Female Diet_Counseling    None             10  9.18 0.666   9.25  8.93  9.4 
##  4 Female Diet_Plus_Exercise 1-2_per_week      9  8.59 1.08    8.8   8.3   9.1 
##  5 Female Diet_Plus_Exercise 3+_per_week       8  8.12 1.65    7.85  6.88  9.65
##  6 Female Diet_Plus_Exercise None              7  9.24 1.21    8.8   8.45 10.2 
##  7 Female Intensive_Program  1-2_per_week     10  7.54 1.42    7.4   7.1   7.9 
##  8 Female Intensive_Program  3+_per_week       6  8.95 0.787   8.7   8.33  9.68
##  9 Female Intensive_Program  None              5  8.52 1.25    8.6   7.9   8.7 
## 10 Female Standard_Care      1-2_per_week      7  8.81 1.05    9.3   8.1   9.55
## # ℹ 14 more rows

follow_summary <- diabetes_data |>
  group_by(sex,intervention,exercise_freq) |>
  summarise(
    n = n(),
    Mean = mean(followup_hba1c),
    SD = sd(followup_hba1c),
    Median = median(followup_hba1c),
    Q1 = quantile(followup_hba1c,0.25),
    Q3 = quantile(followup_hba1c,0.75)
  )

## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by sex, intervention, and exercise_freq.
## ℹ Output is grouped by sex and intervention.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(sex, intervention, exercise_freq))` for per-operation
##   grouping (`?dplyr::dplyr_by`) instead.

print(follow_summary)

## # A tibble: 24 × 9
## # Groups:   sex, intervention [8]
##    sex    intervention       exercise_freq     n  Mean    SD Median    Q1    Q3
##    <chr>  <chr>              <chr>         <int> <dbl> <dbl>  <dbl> <dbl> <dbl>
##  1 Female Diet_Counseling    1-2_per_week      5  8.54 1.26    9.2   7.3   9.2 
##  2 Female Diet_Counseling    3+_per_week      10  7.18 0.857   7.15  6.6   7.62
##  3 Female Diet_Counseling    None             10  7.21 0.640   7.3   6.93  7.68
##  4 Female Diet_Plus_Exercise 1-2_per_week      9  6.97 0.876   6.8   6.5   7.7 
##  5 Female Diet_Plus_Exercise 3+_per_week       8  7.04 0.894   6.95  6.48  7.5 
##  6 Female Diet_Plus_Exercise None              7  6.71 0.867   6.3   6.15  7.1 
##  7 Female Intensive_Program  1-2_per_week     10  7.07 0.958   7     6.67  7.75
##  8 Female Intensive_Program  3+_per_week       6  6.57 0.532   6.45  6.15  7.05
##  9 Female Intensive_Program  None              5  6.56 0.835   6.1   5.9   7.2 
## 10 Female Standard_Care      1-2_per_week      7  8.44 0.627   8.5   8     8.85
## # ℹ 14 more rows

change_summary <- diabetes_data |>
  group_by(sex,intervention,exercise_freq) |>
  summarise(
    n = n(),
    Mean = mean(hba1c_change),
    SD = sd(hba1c_change),
    Median = median(hba1c_change),
    Q1 = quantile(hba1c_change,0.25),
    Q3 = quantile(hba1c_change,0.75)
  )

## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by sex, intervention, and exercise_freq.
## ℹ Output is grouped by sex and intervention.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(sex, intervention, exercise_freq))` for per-operation
##   grouping (`?dplyr::dplyr_by`) instead.

print(change_summary)

## # A tibble: 24 × 9
## # Groups:   sex, intervention [8]
##    sex    intervention      exercise_freq     n  Mean    SD Median      Q1    Q3
##    <chr>  <chr>             <chr>         <int> <dbl> <dbl>  <dbl>   <dbl> <dbl>
##  1 Female Diet_Counseling   1-2_per_week      5 1     2.48   1.5    1.4     2.7 
##  2 Female Diet_Counseling   3+_per_week      10 0.73  1.16   0.75   0.0250  1.5 
##  3 Female Diet_Counseling   None             10 1.97  0.994  2      1.45    2.25
##  4 Female Diet_Plus_Exerci… 1-2_per_week      9 1.62  1.24   2.1    1.4     2.3 
##  5 Female Diet_Plus_Exerci… 3+_per_week       8 1.09  1.93   1.6   -0.300   2.42
##  6 Female Diet_Plus_Exerci… None              7 2.53  1.13   2.4    1.95    2.55
##  7 Female Intensive_Program 1-2_per_week     10 0.47  1.46   0.200 -0.5     0.95
##  8 Female Intensive_Program 3+_per_week       6 2.38  0.382  2.5    2.15    2.7 
##  9 Female Intensive_Program None              5 1.96  1.89   2      1.4     2.6 
## 10 Female Standard_Care     1-2_per_week      7 0.371 1.19   0     -0.150   0.95
## # ℹ 14 more rows

diabetes_data <- diabetes_data |>
  mutate(hba1c_increase = ifelse(hba1c_change > 0,"YES","NO"),
         glucose_healthy = ifelse(fasting_glucose < 70,"Low",
                                  ifelse(fasting_glucose < 100,"Normal","High"))
  )
diabetes_data |> count(hba1c_increase,glucose_healthy)

##   hba1c_increase glucose_healthy   n
## 1             NO            High  43
## 2             NO          Normal   1
## 3            YES            High 149
## 4            YES          Normal   7

#Part 2

boxplot_baseline <- ggplot(diabetes_data,aes(x = sex,y = baseline_hba1c,fill = sex))+
  geom_boxplot(alpha = 0.7,outlier.shape = 16,outlier.size = 2)+
  geom_jitter(width = 0.1,alpha = 0.3,size = 1.5)+
  scale_fill_manual(values = c("Male" = "#d41b0b",
                               "Female" = "#0b51d4"))+
  labs(
    title = "Baseline Hemoglobin Level by Sex",
    x = "Sex",
    y = "Baseline Hemoglobin",
    caption = "Boxplot with data overlaid"
  )+
  theme_classic()+
  theme(
    plot.title = element_text(size = 14,face = "bold",hjust = 0.5),
    axis.title = element_text(size = 11,face = "bold"),
    plot.caption = element_text(size = 10,color = "gray40"),
    legend.position = "none"
  )
print(boxplot_baseline)

hist_change <- ggplot(diabetes_data,aes(x = hba1c_change))+
  geom_histogram(
    binwidth = 0.5,
    color = "#a11212",fill = "#db516a"
  )+
  labs(
    title = "Distribution of Change in Hemoglobin",
    x = "Change",
    y = "Count"
  )+
  theme_classic()+
  theme(
    plot.title = element_text(size = 14,face = "bold",hjust = 0.5),
    axis.title = element_text(size = 11,face = "bold",color = "gray30"),
    legend.position = "none"
  )
print(hist_change)

bar_data <- diabetes_data |>
  group_by(hba1c_increase) |>
  summarise(
    n = n(),
    Mean = mean(hba1c_change)
  )
barplot_change <- ggplot(bar_data,aes(x = hba1c_increase,y = Mean,fill = hba1c_increase))+
  geom_bar(stat = "identity")+
  scale_fill_manual(values = c("YES" = "#26910c",
                               "NO" = "#910c1d"))+
  labs(
    title = "Mean Change Hemoglobin by whether it increases or not",
    subtitle = "Comparison by Increasing/Decreasing groups",
    x = "Group",
    y = "Mean hemoglobin change"
  )+
  theme_classic()+
  theme(
    plot.title = element_text(size = 14,face = "bold",hjust = 0.5),
    plot.subtitle = element_text(size = 11,color = "gray50",hjust = 0.5),
    axis.title = element_text(size = 11,face = "bold"),
    legend.position = "none"
  )
print(barplot_change)

#Part 3:
Comparing hemoglobin change in different exercise groups
Question : “Does exercise influence hemoglobin levels significantly?

cat("\n--- Independence check ---\n")

## 
## --- Independence check ---

cat("Independent groups\n")

## Independent groups

cat("\n--- Normal Distribution Check ---\n")

## 
## --- Normal Distribution Check ---

cat("\nPerforming shapiro - wilk test:\n")

## 
## Performing shapiro - wilk test:

shapiro_1 <- shapiro.test(diabetes_data$hba1c_change[diabetes_data$exercise_freq == "None"])
shapiro_2 <- shapiro.test(diabetes_data$hba1c_change[diabetes_data$exercise_freq == "1-2_per_week"])
shapiro_3 <- shapiro.test(diabetes_data$hba1c_change[diabetes_data$exercise_freq == "3+_per_week"])
if(shapiro_1$p.value > 0.05 & shapiro_2$p.value > 0.05 & shapiro_3$p.value > 0.05){
  cat("\nNormality satisfied (p > 0.05)\n")
}else{
  cat("\nNormality violated (p < 0.05)\n")
  cat("Perform a test for non-parametric\n")
}

## 
## Normality satisfied (p > 0.05)

cat("\nQ-Q Plot test:\n")

## 
## Q-Q Plot test:

par(mfrow = c(1,3))
qqnorm(diabetes_data$hba1c_change[diabetes_data$exercise_freq == "None"],
       main = "Q-Q Plot: No exc")
qqline(diabetes_data$hba1c_change[diabetes_data$exercise_freq == "None"],
       col = "red",lwd = 2)
qqnorm(diabetes_data$hba1c_change[diabetes_data$exercise_freq == "1-2_per_week"],
       main = "Q-Q Plot: 1-2 times exc")
qqline(diabetes_data$hba1c_change[diabetes_data$exercise_freq == "1-2_per_week"],
       col = "red",lwd = 2)
qqnorm(diabetes_data$hba1c_change[diabetes_data$exercise_freq == "3+_per_week"],
       main = "Q-Q Plot: 3+ times exc")
qqline(diabetes_data$hba1c_change[diabetes_data$exercise_freq == "3+_per_week"],
       col = "red",lwd = 2)

par(mfrow = c(1,1))
cat("\n--- Equal variance in groups check ---\n")

## 
## --- Equal variance in groups check ---

cat("\nLevene test:\n")

## 
## Levene test:

levene_change <- leveneTest(hba1c_change ~ exercise_freq,data = diabetes_data)

## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.

if(levene_change$`Pr(>F)`[1] > 0.05){
  cat("\nEqual variance\n")
  cat("Can perform ANOVA\n")
}else{
  cat("\nNot equal variance\n")
}

## 
## Equal variance
## Can perform ANOVA

cat("\nANOVA test:\n")

## 
## ANOVA test:

anova_test <- aov(hba1c_change ~ exercise_freq,data = diabetes_data)
anova_summary <- summary(anova_test)
f_statistic <- anova_summary[[1]]$`F value`[1]
p_value_anova <- anova_summary[[1]]$`Pr(>F)`[1]
if(p_value_anova < 0.05){
  cat("\n(p < 0.05) - reject H₀\n")
  cat("Significantly different change detected\n")
  cat("Perform POST-HOC next\n")
}else{
  cat("\n(p > 0.05) - do not reject H₀\n")
  cat("No significant difference\n")
}

## 
## (p > 0.05) - do not reject H₀
## No significant difference

cat("\nChecking data significance:")

## 
## Checking data significance:

ss_sup <- anova_summary[[1]]$`Sum Sq`[1]
ss_total <- sum(anova_summary[[1]]$`Sum Sq`)
eta_squared <- ss_sup / ss_total
cat("Effect size: ",round(eta_squared,3),"\n")

## Effect size:  0.018

cat("Interpretation:",
    ifelse(eta_squared < 0.01,"Negligible",
           ifelse(eta_squared < 0.06,"Small",
                  ifelse(eta_squared < 0.14,"Medium","Large"))),"\n\n")

## Interpretation: Small

#Hypothesis(all means are equal) not rejected
stat_test <- diabetes_data |>
  t_test(hba1c_change ~ exercise_freq) |>
  adjust_pvalue(method = "bonferroni") |>
  add_significance("p.adj")
pub_anova_plot <- ggplot(diabetes_data,
                         aes(x = exercise_freq,y = hba1c_change,fill = exercise_freq))+
  geom_boxplot(alpha = 0.7,outlier.shape = 16,width = 0.6)+
  geom_jitter(width = 0.15,alpha = 0.3,size = 1.5)+
  stat_summary(fun = mean,geom = "point",shape = 23,size = 3,
               fill = "white",color = "black",stroke = 1.5)+
  scale_fill_manual(values = c("None" = "#cc1f0c",
"1-2_per_week" = "#18990c",
"3+_per_week" = "#130c99"))+
  labs(
    title = "Hemoglobin Change by Exercise",
    subtitle = sprintf("One-way ANOVA: F(%d,%d) = %.2f, p %s",
                       anova_summary[[1]]$Df[1],
                       anova_summary[[1]]$Df[2],
                       f_statistic,
                       ifelse(p_value_anova < 0.001,"< 0.001",
                              paste("=",round(p_value_anova,3)))),
    x = "Exericse",
    y = "Hemoglobin change",
    caption = "white diamond = mean; boxes = median and IQR"
  )+
  theme_classic()+
  theme(
    plot.title = element_text(size = 14,face = "bold",hjust = 0.5),
    plot.subtitle = element_text(size = 11,color = "gray30",hjust = 0.5),
    axis.title = element_text(size = 11,face = "bold"),
    plot.caption = element_text(size = 10,color = "gray40",hjust = 0.5),
    legend.position = "none",
    panel.grid.major.y = element_line(color = "gray90", linewidth = 0.3)
  )
print(pub_anova_plot)

#Part 4
Null Hypothesis not rejected - means are similar across groups
Low eta squared - low influence of hemoglobin change by exercise groups
Limitations of the study: size of the groups not very big - could differ with population

Final Assignment

Wiktor Zygarowicz

2026-02-15