Data analysis

Author

Kai Gavriel

Published

Sunday 12 October 2025

The required packages are loaded first, for use throughout the analysis.

library(readr)      # data import (read_csv)
library(tidyverse)  # data wrangling and plotting
library(ggplot2)
library(ggridges)   # ridgeline plots
library(ggthemes)   # theme_clean()
library(car)        # leveneTest
library(dplyr)
library(afex)       # for aov_car (ANOVA with corrections)
library(lmerTest)   # for LMM with p-values
library(effectsize) # for η² etc.
library(emmeans)    # estimated marginal means and post-hoc contrasts
library(gt)
library(stringr)    # string manipulation
library(kableExtra) # formatted tables (kable, kable_styling)

Preliminary

First, the data were imported and stored as a data frame (df), then reshaped into long format for easier analysis.

Code
df <- read_csv("2025-W41-6_BMS1052_A3_Data.csv")

exclude_carryover <- FALSE # Whether to remove the Carryover test data

df_long <- df %>%
  pivot_longer(
    cols = c(`Pre-test`, `Mid-test`, `Post-test`, `Carryover`),
    names_to = "Test",
    values_to = "Error"
  ) %>%
  { if (exclude_carryover) filter(., Test != "Carryover") else . }

Removing outliers

Code
df_flag <- df_long %>%
  group_by(Training, Test) %>%
  mutate(
    Q1 = quantile(Error, 0.25, na.rm = TRUE),
    Q3 = quantile(Error, 0.75, na.rm = TRUE),
    IQR = Q3 - Q1,
    is_outlier = Error < (Q1 - 1.5 * IQR) | Error > (Q3 + 1.5 * IQR)
  ) %>%
  ungroup()
outliers_list <- df_flag %>%
  filter(is_outlier) %>%
  arrange(Training, Test, Participant)

outliers_summary <- outliers_list %>%
  summarise(
    total_outliers = n(),
    total_rows = nrow(df_long),
    perc = 100 * total_outliers / total_rows
  )

df_flag <- df_flag %>%
  mutate(Test = factor(Test,
                       levels = c("Carryover", "Post-test", "Mid-test", "Pre-test")))

df_clean <- df_flag %>% filter(!is_outlier)
df_out   <- df_flag %>% filter(is_outlier)

point_nudge <- 0.06

ggplot() +
  stat_density_ridges(
    data = df_clean,
    aes(x = Error, y = Test),
    fill = "grey85",
    colour = "grey40",
    alpha = 0.7,
    scale = 0.95,         
    rel_min_height = 0.01,
    size = 0.3
  ) +
  geom_point(
    data = df_out,
    aes(x = Error, y = Test),
    shape = 21,           
    fill  = "red",
    colour = "white",    
    stroke = 0.6,
    size = 2.3,
    position = position_nudge(y = point_nudge) 
  ) +
  facet_wrap(~ Training) +
  theme_clean() +
  theme(
    panel.grid.minor = element_blank()
  ) +
  labs(
    title = "Ridgeline distributions of the data collected",
    subtitle = "Outliers shown as red points",
    x = "Absolute error (cm)",
    y = "Test phase"
  )
Figure 1: Distributions of each testing group by phase, with outliers highlighted

Before any analyses were performed, the data were checked for outliers using the 1.5 × IQR rule within each Training × Test cell. A total of 12 outliers were found, comprising 3.61 per cent of all observations. These were removed from subsequent analyses and are highlighted in Figure 1.
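The figures quoted above come directly from outliers_summary, which can be printed as a check:

Code
outliers_summary  # total outliers, total rows, and the percentage quoted in the text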

Normality

Many common statistical analysis models rely on the assumption that the data approximate a normal distribution. To assess whether each group fulfilled this criterion, a Q–Q plot was constructed for each combination of training group and testing phase, as shown in Figure 2.

Code
df_flag <- df_flag %>%
  mutate(Test = factor(Test,
                       levels = c("Pre-test", "Mid-test", "Post-test", "Carryover")))

ggplot(df_flag %>% filter(!is_outlier),
       aes(sample = Error)) +
  stat_qq(size = 1, colour = "grey30") +
  stat_qq_line(colour = "red", linewidth = 0.6) +
  facet_grid(Training ~ Test, switch = "both") +
  theme_clean() +
  labs(
    title = "Q–Q plots for normality by Training × Test",
    x = "Theoretical quantiles (normal distribution)",
    y = "Observed quantiles of error"
  ) +
  theme(
    strip.placement = "outside",
    strip.background = element_blank(),
    panel.grid.minor = element_blank()
  )
Figure 2: Quantile–quantile (Q–Q) plots for each training group and test phase, comparing each observed distribution against a theoretical normal distribution

It is evident that the data mostly fulfil this criterion, as the points lie close to the reference line (red). Moreover, given the moderately large group sizes (\(n>20\) per group), small deviations from normality are unlikely to compromise the validity of the subsequent analyses.
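As a complementary numeric check (not part of the original analysis), a Shapiro–Wilk test could be run for each Training × Test cell; a minimal sketch:

Code
df_flag %>%
  filter(!is_outlier) %>%
  group_by(Training, Test) %>%
  summarise(
    W = shapiro.test(Error)$statistic,  # test statistic
    p = shapiro.test(Error)$p.value,    # p > 0.05 suggests no strong departure from normality
    .groups = "drop"
  )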

Homogeneity of variance

Code
df_pre <- df_flag %>%
  filter(Test == "Pre-test", !is.na(Error), !is_outlier) %>%
  mutate(Training = as.factor(Training))

# center = median gives the Brown–Forsythe variant, which is more robust to non-normality
levene_res <- leveneTest(Error ~ Training, data = df_pre, center = median)
levene_p <- levene_res$`Pr(>F)`[1]

To assess baseline variance, Levene’s test (median-centred, i.e. the Brown–Forsythe variant) was performed; it showed no significant difference in variance between the groups: \(F(2, 73)=1.002\), \(p=0.37\).

A violin plot (Figure 3) was also included for the purpose of displaying the variance visually.

Code
df_pre <- df_flag %>%
  filter(Test == "Pre-test", !is_outlier, !is.na(Error)) %>%
  mutate(Training = factor(Training))

ggplot(df_pre, aes(x = Training, y = Error, fill = Training)) +
  geom_violin(trim = FALSE, alpha = 0.7) +
  geom_boxplot(width = 0.1, colour = "black", fill = "white", outlier.shape = NA) +
  theme_clean() +
  labs(
    title = "Baseline (pre-test) error distributions",
    x = "Training group",
    y = "Error"
  ) +
  theme(legend.position = "none")
Figure 3: Error distributions by training group for pre-tests only (outliers removed)

Analyses

Descriptive plots

To adjust for baseline performance, each participant’s mean pre-test score was subtracted from their later test scores; the resulting changes are shown in Figure 4.

Code
test_levels <- c("Pre-test", "Mid-test", "Post-test", "Carryover")

# 1) Baselines per participant (average if somehow >1 Pre row), outliers removed
pre_vals <- df_flag %>%
  filter(!is_outlier, Test == "Pre-test") %>%
  group_by(Participant, Training) %>%
  summarise(pre_val = mean(Error, na.rm = TRUE), .groups = "drop")

# 2) Join baselines back, compute change, drop cases with no baseline
df_change <- df_flag %>%
  filter(!is_outlier) %>%
  left_join(pre_vals, by = c("Participant", "Training")) %>%
  mutate(
    Test = factor(Test, levels = test_levels),
    Change = Error - pre_val
  ) %>%
  filter(Test != "Pre-test") %>%          # no change for Pre
  drop_na(pre_val)                        # remove participants lacking a Pre value

# 3) Summarise and plot
df_change_summary <- df_change %>%
  group_by(Training, Test) %>%
  summarise(
    mean_change = mean(Change, na.rm = TRUE),
    se = sd(Change, na.rm = TRUE) / sqrt(n()),
    .groups = "drop"
  )

ggplot(df_change_summary,
       aes(x = Test, y = mean_change, group = Training, colour = Training)) +
  geom_ribbon(aes(ymin = mean_change - se, ymax = mean_change + se),
              fill = "grey70", alpha = 0.25, colour = NA) +
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.6) +
  theme_clean() +
  labs(
    title = "Mean change from baseline (pre-test)",
    x = "Test phase",
    y = "Change in Error (±SE)",
    colour = "Training group"
  )
Figure 4: A plot of mean change from pre-test scores by training group (shaded ribbons show standard error)

It may be more intuitive to pool the two training groups and compare them against the control, as shown in Figure 5. Again, this shows that the pooled training groups had consistently smaller changes from baseline (on average) across all test phases.

Code
test_levels <- c("Pre-test", "Mid-test", "Post-test", "Carryover")

# --- Baselines (Pre-test) per participant
pre_vals <- df_flag %>%
  filter(!is_outlier, Test == "Pre-test") %>%
  group_by(Participant, Training) %>%
  summarise(pre_val = mean(Error, na.rm = TRUE), .groups = "drop")

# --- Pooled groups: Training vs Control
df_pooled <- df_flag %>%
  filter(!is_outlier) %>%
  mutate(
    Group2 = case_when(
      Training %in% c("Physical", "Visual") ~ "Training",
      Training %in% c("Control", "No training", "None") ~ "Control",
      TRUE ~ Training
    )
  ) %>%
  left_join(pre_vals, by = c("Participant", "Training")) %>%
  mutate(
    Test = factor(Test, levels = test_levels),
    Change = Error - pre_val
  ) %>%
  drop_na(pre_val) %>%
  filter(Group2 %in% c("Training", "Control"))

# --- Summarise mean ± SE
df_change_summary <- df_pooled %>%
  group_by(Group2, Test) %>%
  summarise(
    mean_change = mean(Change, na.rm = TRUE),
    se = sd(Change, na.rm = TRUE) / sqrt(n()),
    .groups = "drop"
  )

# --- Plot: mean ± SE with colour-matched ribbons
ggplot(df_change_summary,
       aes(x = Test, y = mean_change, group = Group2, colour = Group2, fill = Group2)) +
  geom_ribbon(aes(ymin = mean_change - se, ymax = mean_change + se),
              alpha = 0.2, colour = NA) +
  geom_line(linewidth = 1.1) +
  geom_point(size = 3) +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.6) +
  theme_clean() +
  labs(
    title = "Mean change from baseline (pre-test)",
    x = "Test phase",
    y = "Change in error from baseline",
    colour = "Group",
    fill = "Group"
  )
Figure 5: Mean change from pre-test scores by presence or absence of either training type (ribbons show standard error)

Figure 6 shows that, on average, baseline-normalised errors remained lower than the control in the physical group (except in the carryover test), and higher than the control in the visual group.

Code
test_levels <- c("Pre-test", "Mid-test", "Post-test", "Carryover")

# 1) Get each participant's baseline (after removing outliers)
pre_vals <- df_flag %>%
  filter(!is_outlier, Test == "Pre-test") %>%
  group_by(Participant, Training) %>%
  summarise(pre_val = mean(Error, na.rm = TRUE), .groups = "drop")

# 2) Join baseline, compute normalised score, drop those without a baseline
df_norm <- df_flag %>%
  filter(!is_outlier) %>%
  left_join(pre_vals, by = c("Participant", "Training")) %>%
  mutate(
    Test = factor(Test, levels = test_levels),
    Norm = Error / pre_val
  ) %>%
  drop_na(pre_val)   # ensures Norm is well-defined (no Pre value -> remove)

# 3) Summarise and plot
df_norm_summary <- df_norm %>%
  group_by(Training, Test) %>%
  summarise(
    mean_norm = mean(Norm, na.rm = TRUE),
    se = sd(Norm, na.rm = TRUE) / sqrt(n()),
    .groups = "drop"
  )

ggplot(df_norm_summary,
       aes(x = Test, y = mean_norm, group = Training, colour = Training)) +
  geom_ribbon(aes(ymin = mean_norm - se, ymax = mean_norm + se),
              fill = "grey70", alpha = 0.25, colour = NA) +
  geom_line(linewidth = 1) +
  geom_point(size = 3) +
  geom_hline(yintercept = 1, linetype = "dashed", alpha = 0.6) +
  theme_clean() +
  labs(
    title = "Baseline-normalised error across test phases",
    x = "Test phase",
    y = "Error relative to baseline",
    colour = "Training group"
  )
Figure 6: Distribution of error scores after adjustment for baseline (pre-test score) – ribbons show standard error

ANCOVA

Summary of ANCOVA results

Below is an ANCOVA, which adjusted for baseline performance as a covariate (the ‘C’ in ‘ANCOVA’). Importantly, there was a marginally significant interaction between training type and test phase (\(F(5.53,\,190.78)=1.84\), \(p=.099\)); this interaction is arguably the most important effect here, even though it falls short of significance at the conventional \(\alpha=0.05\).

In addition, training type and test phase each showed a statistically significant main effect (\(p=.035\) and \(p<.001\) respectively), and baseline performance significantly predicted later performance (\(p<.001\)), as did its interaction with test phase (\(p<.001\)).

Other things to note:

  • Post-hoc analyses used Tukey-adjusted \(p\)-values for between-group contrasts and Bonferroni-corrected \(p\)-values for within-group contrasts, to control the family-wise error rate.
  • Tests for sphericity (Mauchly’s) indicated a violation, so the Greenhouse–Geisser correction was applied.
  • Pre- vs post-test contrasts within each training group were all significant (\(p<0.05\)).
  • Type III sums of squares were used.
Code
# Baseline per participant (for covariate)
pre_vals <- df_flag %>%
  filter(Test == "Pre-test", !is_outlier) %>%
  group_by(Participant, Training) %>%
  summarise(PreError = mean(Error, na.rm = TRUE), .groups = "drop")

# Merge baseline into full dataset
df_anova <- df_flag %>%
  filter(!is_outlier) %>%
  left_join(pre_vals, by = c("Participant", "Training")) %>%
  mutate(
    Training = factor(Training),
    Test = factor(Test, levels = c("Pre-test", "Mid-test", "Post-test", "Carryover")),
    Participant = factor(Participant)
  ) %>%
  drop_na(Error, PreError)
anova_baseline <- aov_car(
  Error ~ PreError + Training * Test + Error(Participant/Test),
  data = df_anova,
  type = 3,    # Type III sums of squares
  factorize = FALSE
)
anova_baseline
Anova Table (Type 3 tests)

Response: Error
         Effect           df   MSE         F  ges p.value
1      PreError        1, 69 25.02 98.89 *** .379   <.001
2      Training        2, 69 25.02    3.51 * .042    .035
3          Test 2.76, 190.78 12.22 16.01 *** .118   <.001
4 PreError:Test 2.76, 190.78 12.22 36.28 *** .232   <.001
5 Training:Test 5.53, 190.78 12.22    1.84 + .030    .099
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '+' 0.1 ' ' 1

Sphericity correction method: GG 
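The sphericity diagnostics behind this correction, along with effect sizes for each term (the effectsize package was loaded for this), can be inspected from the fitted object; a brief sketch:

Code
summary(anova_baseline)      # univariate tests, Mauchly's test, and GG/HF corrections
eta_squared(anova_baseline)  # (partial) eta-squared per effect, via effectsize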

Post-hoc analyses

Between-group comparisons within each test were conducted:

Code
emm_train_by_test <- emmeans(anova_baseline, ~ Training | Test)
pairs(emm_train_by_test, adjust = "tukey")
Test = Pre.test:
 contrast           estimate   SE df t.ratio p.value
 Control - Physical    0.000 0.00 69   0.795  0.7076
 Control - Visual      0.000 0.00 69   0.178  0.9828
 Physical - Visual     0.000 0.00 69  -0.675  0.7786

Test = Mid.test:
 contrast           estimate   SE df t.ratio p.value
 Control - Physical    1.866 1.38 69   1.352  0.3716
 Control - Visual     -0.370 1.37 69  -0.270  0.9608
 Physical - Visual    -2.236 1.26 69  -1.770  0.1872

Test = Post.test:
 contrast           estimate   SE df t.ratio p.value
 Control - Physical    2.960 1.29 69   2.295  0.0632
 Control - Visual      1.191 1.28 69   0.929  0.6240
 Physical - Visual    -1.769 1.18 69  -1.499  0.2978

Test = Carryover:
 contrast           estimate   SE df t.ratio p.value
 Control - Physical    0.807 1.37 69   0.588  0.8268
 Control - Visual     -2.269 1.36 69  -1.664  0.2265
 Physical - Visual    -3.076 1.26 69  -2.450  0.0438

P value adjustment: tukey method for comparing a family of 3 estimates 

As were within-group comparisons:

Code
emm_test_by_train <- emmeans(anova_baseline, ~ Test | Training)
pairs(emm_test_by_train, adjust = "bonferroni")
Training = Control:
 contrast              estimate    SE df t.ratio p.value
 Pre.test - Mid.test      2.072 1.030 69   2.003  0.2949
 Pre.test - Post.test     2.879 0.967 69   2.978  0.0240
 Pre.test - Carryover     2.514 1.030 69   2.445  0.1023
 Mid.test - Post.test     0.807 1.080 69   0.748  1.0000
 Mid.test - Carryover     0.442 1.100 69   0.402  1.0000
 Post.test - Carryover   -0.365 1.270 69  -0.288  1.0000

Training = Physical:
 contrast              estimate    SE df t.ratio p.value
 Pre.test - Mid.test      3.939 0.874 69   4.507  0.0002
 Pre.test - Post.test     5.840 0.817 69   7.151  <.0001
 Pre.test - Carryover     3.321 0.868 69   3.824  0.0017
 Mid.test - Post.test     1.901 0.911 69   2.088  0.2432
 Mid.test - Carryover    -0.618 0.929 69  -0.665  1.0000
 Post.test - Carryover   -2.518 1.070 69  -2.354  0.1285

Training = Visual:
 contrast              estimate    SE df t.ratio p.value
 Pre.test - Mid.test      1.702 0.907 69   1.876  0.3889
 Pre.test - Post.test     4.070 0.848 69   4.801  0.0001
 Pre.test - Carryover     0.245 0.902 69   0.272  1.0000
 Mid.test - Post.test     2.368 0.945 69   2.505  0.0877
 Mid.test - Carryover    -1.457 0.964 69  -1.511  0.8115
 Post.test - Carryover   -3.825 1.110 69  -3.444  0.0059

P value adjustment: bonferroni method for 6 tests 
Code
emm_df <- as.data.frame(emmeans(anova_baseline, ~ Training * Test))

ggplot(emm_df, aes(x = Test, y = emmean, colour = Training, group = Training)) +
  geom_ribbon(aes(ymin = lower.CL, ymax = upper.CL, fill = Training),
              alpha = 0.15, colour = NA) +
  geom_line(linewidth = 1.1) +
  geom_point(size = 3) +
  theme_clean() +
  labs(
    title = "Interaction between Training type and Test phase (baseline-adjusted)",
    x = "Test phase",
    y = "Estimated marginal mean Error (±95 % CI)",
    colour = "Training group",
    fill = "Training group"
  )

Code
# Helper to standardise test labels from emmeans (dot -> hyphen) and order them
fix_test_labels <- function(x) {
  x %>%
    mutate(Test = str_replace_all(Test, c("Pre\\.test"="Pre-test",
                                          "Mid\\.test"="Mid-test",
                                          "Post\\.test"="Post-test"))) %>%
    mutate(Test = factor(Test, levels = c("Pre-test","Mid-test","Post-test","Carryover")))
}

# 0) Get EMMs from your ANOVA and fix labels
emm_full <- emmeans(anova_baseline, ~ Training * Test) %>% as.data.frame() %>% fix_test_labels()

# -------------------------------
# 1) Control vs Physical
# -------------------------------
emm_CP <- emm_full %>% filter(Training %in% c("Control","Physical"))

ggplot(emm_CP, aes(x = Test, y = emmean, colour = Training, group = Training)) +
  geom_ribbon(aes(ymin = lower.CL, ymax = upper.CL, fill = Training),
              alpha = 0.15, colour = NA) +
  geom_line(linewidth = 1.1) +
  geom_point(size = 3) +
  theme_clean() +
  labs(
    title = "Baseline-adjusted means: Control vs Physical",
    x = "Test phase",
    y = "Estimated marginal mean Error (±95% CI)",
    colour = "Group", fill = "Group"
  )

Code
# -------------------------------
# 2) Physical vs (Control+Visual pooled)
# -------------------------------

# EMMs by Training | Test (for pooling), then fix labels
emm_T_by_Tr <- emmeans(anova_baseline, ~ Training | Test)
emm_T_by_Tr_df <- as.data.frame(emm_T_by_Tr) %>% fix_test_labels()

# Weighted pooling weights (per-participant counts)
n_tbl <- df_anova %>% distinct(Participant, Training) %>% count(Training)
w_C <- n_tbl$n[n_tbl$Training == "Control"]; if (length(w_C)==0) w_C <- 0
w_V <- n_tbl$n[n_tbl$Training == "Visual"];  if (length(w_V)==0) w_V <- 0
w_sum <- w_C + w_V; if (w_sum == 0) { w_C <- 1; w_V <- 1; w_sum <- 2 }

# Build contrast vector aligned to emm factor order
levs <- levels(emm_T_by_Tr)$Training
coef_vec <- setNames(rep(0, length(levs)), levs)
coef_vec["Control"] <- w_C / w_sum
coef_vec["Visual"]  <- w_V / w_sum

emm_pooled <- contrast(emm_T_by_Tr, method = list("Control+Visual" = as.numeric(coef_vec)),
                       by = "Test", infer = c(TRUE, TRUE)) %>%
  as.data.frame() %>%
  fix_test_labels() %>%
  transmute(Group2 = "Control+Visual", Test, emmean = estimate, lower.CL, upper.CL)

phys_df <- emmeans(anova_baseline, ~ Test | Training) %>%
  as.data.frame() %>%
  fix_test_labels() %>%
  filter(Training == "Physical") %>%
  transmute(Group2 = "Physical", Test, emmean, lower.CL, upper.CL)

emm_two <- bind_rows(emm_pooled, phys_df)

ggplot(emm_two, aes(x = Test, y = emmean, colour = Group2, group = Group2)) +
  geom_ribbon(aes(ymin = lower.CL, ymax = upper.CL, fill = Group2),
              alpha = 0.18, colour = NA) +
  geom_line(linewidth = 1.1) +
  geom_point(size = 3) +
  theme_clean() +
  labs(
    title = "Baseline-adjusted means: Physical vs (Control+Visual pooled)",
    x = "Test phase",
    y = "Estimated marginal mean Error (±95% CI)",
    colour = "Group", fill = "Group"
  )
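Because lmerTest was loaded for mixed models, an equivalent linear mixed model can serve as a robustness check on the ANCOVA; a minimal sketch (not part of the reported results), reusing df_anova from above:

Code
# Random intercept per participant; fixed effects mirror the ANCOVA terms
lmm <- lmer(Error ~ PreError + Training * Test + (1 | Participant),
            data = df_anova)
anova(lmm)  # F-tests with Satterthwaite degrees of freedom (via lmerTest)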

Paired t-tests

Code
df_prepost <- df_flag %>%
  filter(Test %in% c("Pre-test", "Post-test"), !is_outlier) %>%
  select(Participant, Training, Test, Error) %>%
  pivot_wider(names_from = Test, values_from = Error)

t_results <- df_prepost %>%
  group_by(Training) %>%
  summarise(
    # n counts every participant in the group; the paired t-test drops
    # incomplete pairs, so its df can be smaller than n - 1
    n         = n(),
    t         = t.test(`Pre-test`, `Post-test`, paired = TRUE)$statistic,
    df        = t.test(`Pre-test`, `Post-test`, paired = TRUE)$parameter,
    p_numeric = t.test(`Pre-test`, `Post-test`, paired = TRUE)$p.value,
    mean_pre  = mean(`Pre-test`,  na.rm = TRUE),
    mean_post = mean(`Post-test`, na.rm = TRUE),
    mean_diff = mean(`Pre-test` - `Post-test`, na.rm = TRUE),
    .groups = "drop"
  )

t_results_out <- t_results %>%
  mutate(
    p = case_when(
      p_numeric < 0.001 ~ "< 0.001",
      p_numeric < 0.01  ~ "< 0.01",
      p_numeric < 0.05  ~ "< 0.05",
      p_numeric < 0.10  ~ "< 0.10",
      TRUE              ~ "≥ 0.10"
    ),
    t = round(t, 2),
    df = round(df, 0),
    `t-statistic` = paste0("\\(t(", df, ") = ", t, "\\)")
  ) %>%
  rename(
    `Training type` = Training,
    Pre  = mean_pre,
    Post = mean_post,
    Diff = mean_diff
  ) %>%
  select(`Training type`, n, Pre, Post, Diff, `t-statistic`, p)

t_results_out |>
  kable(
    digits = 3,
    caption = "Paired t-tests (comparing error in pre- vs post-tests) per training type",
    escape = FALSE,
    align = "lcccccc",
  ) |>
  kable_styling(full_width = FALSE)
Paired t-tests (comparing error in pre- vs post-tests) per training type

Training type    n      Pre     Post    Diff   t-statistic        p
Control         22   17.407   12.780   4.924   \(t(20) = 3.18\)   < 0.01
Physical        33   13.386    9.345   4.569   \(t(29) = 4.62\)   < 0.001
Visual          27   14.976   11.728   4.316   \(t(24) = 3.02\)   < 0.01
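A standardised effect size is straightforward to add to the table above; a minimal sketch of paired Cohen’s d (mean of the differences divided by their standard deviation), not part of the original output:

Code
d_results <- df_prepost %>%
  group_by(Training) %>%
  summarise(
    cohens_d = mean(`Pre-test` - `Post-test`, na.rm = TRUE) /
               sd(`Pre-test` - `Post-test`, na.rm = TRUE),
    .groups = "drop"
  )
d_results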
For comparison, the same analysis was repeated with outliers retained:

Code
df_prepost <- df_flag %>%
  filter(Test %in% c("Pre-test", "Post-test")) %>%
  select(Participant, Training, Test, Error) %>%
  pivot_wider(names_from = Test, values_from = Error)

t_results <- df_prepost %>%
  group_by(Training) %>%
  summarise(
    n         = n(),
    t         = t.test(`Pre-test`, `Post-test`, paired = TRUE)$statistic,
    df        = t.test(`Pre-test`, `Post-test`, paired = TRUE)$parameter,
    p_numeric = t.test(`Pre-test`, `Post-test`, paired = TRUE)$p.value,
    mean_pre  = mean(`Pre-test`,  na.rm = TRUE),
    mean_post = mean(`Post-test`, na.rm = TRUE),
    mean_diff = mean(`Pre-test` - `Post-test`, na.rm = TRUE),
    .groups = "drop"
  )

t_results_out <- t_results %>%
  mutate(
    p = case_when(
      p_numeric < 0.001 ~ "< 0.001",
      p_numeric < 0.01  ~ "< 0.01",
      p_numeric < 0.05  ~ "< 0.05",
      p_numeric < 0.10  ~ "< 0.10",
      TRUE              ~ "≥ 0.10"
    ),
    t = round(t, 2),
    df = round(df, 0),
    `t-statistic` = paste0("\\(t(", df, ") = ", t, "\\)")
  ) %>%
  rename(
    `Training type` = Training,
    Pre  = mean_pre,
    Post = mean_post,
    Diff = mean_diff
  ) %>%
  select(`Training type`, n, Pre, Post, Diff, `t-statistic`, p)

t_results_out |>
  kable(
    digits = 3,
    caption = "Paired t-tests (comparing error in pre- vs post-tests) per training type",
    escape = FALSE,
    align = "lcccccc",
  ) |>
  kable_styling(full_width = FALSE)
Paired t-tests (comparing error in pre- vs post-tests) per training type

Training type    n      Pre     Post    Diff   t-statistic        p
Control         23   18.991   13.392   5.599   \(t(22) = 3.69\)   < 0.01
Physical        33   15.472    9.345   6.127   \(t(32) = 4.76\)   < 0.001
Visual          27   16.909   11.728   5.181   \(t(26) = 3.56\)   < 0.01

Baseline-adjusted graphs

Code
test_levels <- c("Mid-test","Post-test","Carryover")

pre_vals <- df_flag %>%
  filter(Test == "Pre-test", !is_outlier) %>%
  group_by(Participant, Training) %>%
  summarise(pre_val = mean(Error, na.rm = TRUE), .groups = "drop")

df_change <- df_flag %>%
  filter(!is_outlier) %>%
  left_join(pre_vals, by = c("Participant","Training")) %>%
  mutate(
    Change = Error - pre_val,
    Test = factor(Test, levels = test_levels),
    Group2 = if_else(Training == "Physical", "Physical", "Control+Visual")
  ) %>%
  filter(Test %in% test_levels) %>%
  drop_na(pre_val)

ggplot(df_change, aes(x = Change, y = Test, fill = Group2)) +
  geom_density_ridges(
    alpha = 0.7, scale = 1.1, rel_min_height = 0.02, colour = "grey25"
  ) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) +
  scale_fill_manual(
    values = c("Control+Visual" = "grey70", "Physical" = "#4C9F70"),
    name = "Group"
  ) +
  theme_clean() +
  labs(
    title = "Change from baseline (Pre-test) across Test phases",
    subtitle = "Physical vs pooled Control+Visual groups",
    x = "Change in Error (from baseline)",
    y = "Test phase"
  )

Code
test_levels <- c("Mid-test", "Post-test", "Carryover")

pre_vals <- df_flag %>%
  filter(Test == "Pre-test", !is_outlier) %>%
  group_by(Participant, Training) %>%
  summarise(pre_val = mean(Error, na.rm = TRUE), .groups = "drop")

df_change <- df_flag %>%
  filter(!is_outlier) %>%
  left_join(pre_vals, by = c("Participant", "Training")) %>%
  mutate(
    Change = Error - pre_val,
    Test = factor(Test, levels = test_levels)
  ) %>%
  filter(Test %in% test_levels) %>%
  drop_na(pre_val)

ggplot(df_change, aes(x = Change, y = Test, fill = Training)) +
  geom_density_ridges(
    alpha = 0.7, scale = 1.1, rel_min_height = 0.02, colour = "grey25"
  ) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.5) +
  scale_fill_manual(
    values = c("Control" = "grey70",
               "Physical" = "#4C9F70",
               "Visual" = "#3E8ED0"),
    name = "Training type"
  ) +
  theme_clean() +
  labs(
    title = "Change from baseline (Pre-test) across Test phases",
    subtitle = "All training groups (Control, Physical, Visual)",
    x = "Change in Error (from baseline)",
    y = "Test phase"
  )

Code
df_postcarry <- df_flag %>%
  filter(!is_outlier, Test %in% c("Post-test", "Carryover")) %>%
  mutate(
    Test = factor(Test, levels = c("Post-test", "Carryover")),
    Training = factor(Training, levels = c("Control", "Physical", "Visual"))
  )

ggplot(df_postcarry, aes(x = Error, fill = Training)) +
  geom_density(alpha = 0.55, colour = "black", linewidth = 0.7, adjust = 1.2) +
  facet_wrap(~ Test, ncol = 2) +  # side-by-side panels for Post-test and Carryover
  theme_clean() +
  labs(
    title = "Score distributions by Training group",
    subtitle = "Post-test and Carryover (outliers removed)",
    x = "Error",
    y = "Density",
    fill = "Training group"
  )

Code
# df_prepost was last rebuilt with outliers retained; re-derive from the
# outlier-flagged data so the plot below matches its "outliers removed" subtitle
df_prepost_long <- df_flag %>%
  filter(Test %in% c("Pre-test", "Post-test"), !is_outlier) %>%
  select(Participant, Training, Test, Error) %>%
  mutate(Test = factor(Test, levels = c("Pre-test", "Post-test")))

ggplot(df_prepost_long, aes(x = Test, y = Error, fill = Test)) +
  geom_violin(alpha = 0.5, colour = "black", width = 0.9, adjust = 1.2) +
  geom_boxplot(width = 0.15, outlier.shape = NA, alpha = 0.7, colour = "black") +
  facet_wrap(~ Training) +
  scale_fill_manual(values = c("Pre-test" = "grey70", "Post-test" = "#4C9F70")) +
  theme_clean() +
  labs(
    title = "Pre- vs Post-test distributions per training group",
    subtitle = "Outliers removed",
    x = NULL,
    y = "Error",
    fill = "Test phase"
  )