This notebook generates quality control visualizations from the QC report CSV.
library(tidyverse)
library(here)
library(forcats)
# Specify which QC report to load
qc_filename <- "qc_report_20260511.csv" # <-- CHANGE THIS TO YOUR FILE
# Load the data
qc_data <- read_csv(here("data/qc_checks/", qc_filename)) |> filter(participant_id != "MW000")
# Quick look at the data
glimpse(qc_data)
## Rows: 676
## Columns: 19
## $ participant_id <chr> "MW001", "MW001", "MW001", "MW001", "MW001", "MW00…
## $ trial_index <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, …
## $ trial_name <chr> "sesameus_2", "sesameindia_2", "sesameus_1", "sesa…
## $ block_id <chr> "sesame", "sesame", "sesame", "sesame", "slow", "s…
## $ block_index <dbl> 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 0, 0, 0, 0, 1,…
## $ n_samples <dbl> 220, 14918, 14922, 14923, 14917, 14921, 14922, 149…
## $ trial_duration_sec <dbl> 7.13, 59.66, 59.68, 59.68, 59.65, 59.67, 59.68, 59…
## $ valid_duration_sec <dbl> 0.80, 51.71, 52.26, 53.13, 52.00, 53.83, 52.15, 51…
## $ left_n_valid <dbl> 200, 12974, 13285, 13425, 13089, 13639, 13254, 131…
## $ left_percent_valid <dbl> 90.91, 86.97, 89.03, 89.96, 87.75, 91.41, 88.82, 8…
## $ right_n_valid <dbl> 200, 12879, 12846, 13138, 12912, 13277, 12820, 127…
## $ right_percent_valid <dbl> 90.91, 86.33, 86.09, 88.04, 86.56, 88.98, 85.91, 8…
## $ mean_percent_valid <dbl> 90.91, 86.65, 87.56, 89.00, 87.15, 90.20, 87.37, 8…
## $ val_left_deg <dbl> 5.8668, 5.8668, 5.8668, 5.8668, 0.9201, 0.9201, 0.…
## $ val_right_deg <dbl> 6.1905, 6.1905, 6.1905, 6.1905, 1.1640, 1.1640, 1.…
## $ val_mean_deg <dbl> 6.0286, 6.0286, 6.0286, 6.0286, 1.0420, 1.0420, 1.…
## $ val_left_px <dbl> 18.7385, 18.7385, 18.7385, 18.7385, 2.9389, 2.9389…
## $ val_right_px <dbl> 19.7726, 19.7726, 19.7726, 19.7726, 3.7178, 3.7178…
## $ val_mean_px <dbl> 19.2555, 19.2555, 19.2555, 19.2555, 3.3283, 3.3283…
One bar per participant showing mean % valid (averaged across both eyes and all trials), with error bars reflecting variance across trials.
participant_summary <- qc_data %>%
mutate(participant_type = case_when(
startsWith(participant_id, "MWK") ~ "kids (5-7 yos)",
startsWith(participant_id, "MW") ~ "adults",
startsWith(participant_id, "HMET") ~ "infants (12-24 mos)"
)) %>%
group_by(participant_id, participant_type) %>%
summarise(
mean_valid = mean(mean_percent_valid),
sd_valid = sd(mean_percent_valid),
n_trials = n(),
)
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by participant_id and participant_type.
## ℹ Output is grouped by participant_id.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(participant_id, participant_type))` for per-operation
## grouping (`?dplyr::dplyr_by`) instead.
ggplot(participant_summary, aes(x = participant_id, y = mean_valid)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_errorbar(aes(ymin = mean_valid - sd_valid,
ymax = mean_valid + sd_valid),
width = 0.2) +
labs(
title = "Data Validity by Participant",
x = "Participant",
y = "Mean % Valid (both eyes)"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Plot 1b: Validity by Participant Group
summarized_data <- function(data, x_var, y_var, group_var) {
return(data %>%
group_by_at(c(x_var, group_var)) %>%
summarise(mean_value = mean(.data[[y_var]], na.rm = TRUE),
sd_value = sd(.data[[y_var]], na.rm = TRUE),
n = n(),
se = sd_value / sqrt(n()),
ci_lower = mean_value - qt(1 - (0.05 / 2), n - 1) * se,
ci_upper = mean_value + qt(1 - (0.05 / 2), n - 1) * se,
.groups = 'drop')
)
}
participant_type_summary <- summarized_data(participant_summary |> filter(mean_valid > 50), "participant_type", "mean_valid", "participant_type")
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `ci_lower = mean_value - qt(1 - (0.05/2), n - 1) * se`.
## ℹ In group 2: `participant_type = "infants (12-24 mos)"`.
## Caused by warning in `qt()`:
## ! NaNs produced
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
participant_type_summary$participant_type <-
fct_relevel(participant_type_summary$participant_type, "adults", after = Inf)
participant_type_summary_plot <- ggplot(participant_type_summary, aes(x=participant_type, y=mean_value)) +
geom_point(size=4) +
geom_errorbar(aes(ymin=ci_lower, ymax=ci_upper), width=0) +
geom_jitter(data=participant_summary, aes(x=participant_type, y=mean_valid), alpha=0.5, width=0.1) +
ylab("Mean valid % (both eyes)") +
geom_hline(yintercept=50, linetype="dotted") +
xlab("Participant type")
participant_type_summary_plot
ggsave(here("figures/valid_data_per_group.png"), participant_type_summary_plot)
## Saving 12 x 8 in image
Shows validity trends by stimulus, with each participant as a separate line.
ggplot(qc_data, aes(x = trial_name, y = mean_percent_valid,
color = participant_id, group = participant_id)) +
geom_line() +
geom_point() +
labs(
title = "Data Validity by Stimulus",
x = "Trial Name",
y = "% Valid (both eyes)",
color = "Participant"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
mw031_data <- qc_data |> filter(participant_id == 'MW031') |>
pivot_longer(cols=c(left_percent_valid, right_percent_valid), names_to="eye", values_to="percent_valid")
ggplot(mw031_data, aes(x = trial_name, y = percent_valid,
color = eye, group = eye)) +
geom_line() +
geom_point() +
labs(
title = "Data Validity by Stimulus",
x = "Trial Name",
y = "% Valid (left/right)",
color = "Eye"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Shows validity trends over the course of the experiment, with each participant as a separate line.
ggplot(qc_data, aes(x = trial_index, y = mean_percent_valid,
color = participant_id, group = participant_id)) +
geom_line() +
geom_point() +
labs(
title = "Data Validity Over Time",
x = "Trial Index",
y = "% Valid (both eyes)",
color = "Participant"
) +
scale_x_continuous(breaks = seq(0, max(qc_data$trial_index), 1)) +
theme_minimal()
qc_data_trial_mean <- qc_data |>
group_by(trial_index) |>
summarize(
group_mean_valid = mean(mean_percent_valid),
ci = qt(0.975, df = n() - 1) * sd(mean_percent_valid) / sqrt(n())
)
n_participant <- length(unique(qc_data$participant_id))
ggplot(qc_data, aes(x = trial_index, y = mean_percent_valid,
color = as.numeric(str_extract(participant_id, "\\d+$")), group = participant_id)) +
geom_line(alpha = 0.4) +
geom_line(data = qc_data_trial_mean, color = "black", aes(y = group_mean_valid, group = NA)) +
geom_ribbon(data = qc_data_trial_mean,
aes(y = group_mean_valid,
ymin = group_mean_valid - ci,
ymax = group_mean_valid + ci,
group = NA),
fill = "black", alpha = 0.2, color = NA) +
labs(
title = "Data Validity Over Time",
x = "Trial Index",
y = "% Valid (both eyes)",
color = "Participant"
) +
scale_color_viridis_c(
name = "Participant",
breaks = seq(1, n_participant, by = 4),
labels = seq(1, n_participant, by = 4),
limits = c(1, n_participant)
) +
scale_x_continuous(breaks = seq(0, max(qc_data$trial_index), 1)) +
theme_minimal() +
guides(color = guide_colorbar(barwidth = 4, barheight = 5, title.position = "top"))
Shows validation accuracy (in degrees) across blocks, with one line per participant. Higher values = worse calibration.
# Get one row per participant per block (they're repeated across trials)
block_calibration <- qc_data %>%
select(participant_id, block_index, val_mean_deg) %>%
distinct()
ggplot(block_calibration, aes(x = block_index, y = val_mean_deg,
color = participant_id, group = participant_id)) +
geom_line() +
geom_point() +
labs(
title = "Calibration Drift Across Blocks",
x = "Block Index",
y = "Validation Accuracy (degrees)",
color = "Participant"
) +
scale_x_continuous(breaks = 0:3) +
theme_minimal()
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
Scatter plot showing whether worse calibration (higher degrees) is associated with more data loss (lower % valid). Each dot is one trial, colored by participant.
ggplot(qc_data, aes(x = val_mean_deg, y = mean_percent_valid,
color = participant_id)) +
geom_point(size = 2, alpha = 0.7) +
labs(
title = "Calibration Accuracy vs Data Validity",
x = "Validation Accuracy (degrees, lower = better)",
y = "% Valid (both eyes)",
color = "Participant"
) +
theme_minimal()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
summary_data <- qc_data %>%
group_by(participant_id) %>%
summarise(
n_trials = n(),
mean_validity = round(mean(mean_percent_valid), 1),
min_validity = round(min(mean_percent_valid), 1),
mean_calibration_deg = round(mean(val_mean_deg), 2),
max_calibration_deg = round(max(val_mean_deg), 2)
)
summary_data |>
knitr::kable()
| participant_id | n_trials | mean_validity | min_validity | mean_calibration_deg | max_calibration_deg |
|---|---|---|---|---|---|
| HMET004 | 10 | 28.7 | 0.5 | 3.72 | 13.97 |
| HMET005 | 11 | 57.9 | 33.8 | 7.50 | 12.70 |
| HMET007 | 7 | 14.5 | 1.6 | NA | NA |
| MW001 | 12 | 87.3 | 77.2 | 2.70 | 6.03 |
| MW002 | 12 | 91.1 | 86.5 | 1.40 | 2.02 |
| MW003 | 12 | 93.6 | 91.8 | 1.25 | 1.47 |
| MW004 | 12 | 55.4 | 48.1 | 6.67 | 12.19 |
| MW005 | 12 | 96.7 | 94.8 | 1.50 | 2.18 |
| MW006 | 12 | 97.8 | 96.9 | 1.47 | 1.66 |
| MW008 | 12 | 51.5 | 11.6 | 1.34 | 5.92 |
| MW009 | 12 | 62.4 | 26.4 | 4.71 | 12.52 |
| MW010 | 12 | 84.0 | 79.4 | 1.14 | 1.35 |
| MW011 | 12 | 95.0 | 94.2 | 1.36 | 1.53 |
| MW012 | 12 | 90.5 | 87.8 | 1.66 | 1.98 |
| MW013 | 12 | 90.3 | 83.6 | 7.00 | 17.44 |
| MW014 | 12 | 82.1 | 64.3 | 2.03 | 2.56 |
| MW015 | 12 | 96.3 | 94.7 | 1.42 | 1.54 |
| MW016 | 12 | 95.8 | 94.5 | 1.22 | 1.30 |
| MW017 | 12 | 93.5 | 91.0 | 1.41 | 1.86 |
| MW018 | 12 | 96.7 | 95.6 | 1.07 | 1.24 |
| MW019 | 12 | 78.3 | 60.9 | 1.51 | 1.75 |
| MW020 | 12 | 88.0 | 81.1 | 1.71 | 3.39 |
| MW021 | 12 | 79.9 | 60.1 | 1.84 | 2.86 |
| MW022 | 12 | 98.8 | 97.4 | 3.94 | 6.28 |
| MW024 | 12 | 91.9 | 89.2 | 1.29 | 1.83 |
| MW025 | 12 | 98.1 | 96.3 | 1.15 | 1.39 |
| MW026 | 12 | 92.2 | 88.9 | 3.51 | 6.19 |
| MW027 | 12 | 91.9 | 89.3 | 1.96 | 2.71 |
| MW028 | 12 | 96.5 | 94.8 | 1.31 | 1.43 |
| MW029 | 12 | 94.8 | 89.5 | 1.46 | 1.80 |
| MW030 | 12 | 94.1 | 88.3 | 1.14 | 1.50 |
| MW031 | 12 | 69.7 | 53.4 | 2.68 | 3.02 |
| MW032 | 12 | 92.0 | 89.1 | 1.21 | 1.84 |
| MW033 | 12 | 89.6 | 83.1 | 2.25 | 3.50 |
| MW034 | 12 | 97.3 | 94.7 | 6.33 | 11.41 |
| MW036 | 12 | 92.7 | 87.3 | 1.03 | 1.35 |
| MW037 | 12 | 92.0 | 87.2 | 1.30 | 1.71 |
| MW038 | 12 | 90.5 | 88.8 | 0.84 | 1.02 |
| MW039 | 12 | 87.9 | 81.1 | 11.53 | 18.27 |
| MW040 | 12 | 93.5 | 91.6 | 1.21 | 1.33 |
| MW041 | 12 | 98.0 | 97.1 | 1.25 | 1.59 |
| MW042 | 12 | 98.6 | 97.4 | 1.24 | 1.63 |
| MW043 | 12 | 95.2 | 93.2 | 1.24 | 1.38 |
| MW044 | 12 | 94.2 | 87.8 | 1.50 | 2.32 |
| MW045 | 12 | 96.5 | 93.1 | 3.34 | 6.48 |
| MW046 | 12 | 96.1 | 92.4 | 0.90 | 1.07 |
| MW047 | 12 | 99.0 | 98.1 | 1.07 | 1.28 |
| MW048 | 12 | 96.7 | 92.7 | 0.81 | 1.12 |
| MW049 | 12 | 97.3 | 96.1 | 1.40 | 1.57 |
| MW050 | 12 | 96.4 | 94.1 | 4.63 | 4.77 |
| MW051 | 12 | 93.5 | 90.8 | 1.13 | 1.27 |
| MWK004 | 12 | 66.2 | 0.0 | 1.61 | 1.80 |
| MWK005 | 12 | 28.0 | 1.1 | 11.10 | 31.49 |
| MWK006 | 12 | 92.6 | 72.2 | 5.36 | 7.69 |
| MWK01 | 12 | 69.4 | 0.0 | NA | NA |
| MWK02 | 12 | 91.9 | 69.4 | 0.99 | 1.30 |
| MWK03 | 12 | 85.5 | 69.1 | 2.52 | 3.31 |
write.csv(summary_data, here("data/qc_checks/summary.csv"))