Center of Mass Pre-Processing

# Load required libraries
library(readxl)
Warning: package 'readxl' was built under R version 4.4.1
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(purrr)
library(stringr)
library(tidyverse)
Warning: package 'tidyr' was built under R version 4.4.1
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ readr     2.1.5
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#1. load all xsens data per ID

# List of file paths and corresponding block numbers
file_paths <- c(
  "/Users/can/Documents/Uni/Thesis/Data/Xsens/xlsx/id22-001.xlsx",
  "/Users/can/Documents/Uni/Thesis/Data/Xsens/xlsx/id22-002.xlsx",
  "/Users/can/Documents/Uni/Thesis/Data/Xsens/xlsx/id22-003.xlsx",
  "/Users/can/Documents/Uni/Thesis/Data/Xsens/xlsx/id22-004.xlsx",
  "/Users/can/Documents/Uni/Thesis/Data/Xsens/xlsx/id22-005.xlsx"
)

# Optional: extract filenames for output later
file_names <- str_extract(basename(file_paths), "id22-\\d+")

# Loop over each file and process
for (i in seq_along(file_paths)) {
  
  file_path <- file_paths[i]
  block_number <- i  # Assign Block 1–5 based on order
  
  # Read the sheets
  markers <- read_excel(file_path, sheet = "Markers")
  com <- read_excel(file_path, sheet = "Center of Mass")
  
  # Merge the datasets (left join to keep all Center of Mass rows)
  merged_data <- left_join(com, markers, by = "Frame")
  
  # Add Block column
  merged_data <- merged_data %>%
    mutate(Block = block_number)
  
  # Clean Marker Text to ensure it's numeric
  merged_data$`Marker Text` <- as.numeric(trimws(merged_data$`Marker Text`))
  
  # Find the first index where Marker Text is 28
  start_index <- which(merged_data$`Marker Text` == 28)[1]
  
  # Filter from that index onward (if 28 exists)
  if (!is.na(start_index)) {
    filtered_data <- merged_data[start_index:nrow(merged_data), ]
  } else {
    warning(paste("No '28' marker found in:", file_path))
    filtered_data <- merged_data
  }
  
  # Add time columns
  filtered_data <- filtered_data %>%
    mutate(
      ms = Frame * (1000 / 60),
      s = Frame / 60
    )
  
# Define your output folder
output_folder <- "/Users/can/Documents/Uni/Thesis/Data/Xsens/cleaned_csv"

# Save each processed file into that folder
write.csv(
  filtered_data,
  file = file.path(output_folder, paste0(file_names[i], "_processed.csv")),
  row.names = FALSE
)
  
  # Write to CSV (optional)
  write.csv(filtered_data, paste0(file_names[i], "_processed.csv"), row.names = FALSE)
}
Warning: No '28' marker found in:
/Users/can/Documents/Uni/Thesis/Data/Xsens/xlsx/id22-001.xlsx

#2. merge files, so that all blocks are in one dataset

# List of file names

file_list <- c(
  "/Users/can/Documents/Uni/Thesis/Data/Xsens/cleaned_csv/id22-001_processed.csv",
  "/Users/can/Documents/Uni/Thesis/Data/Xsens/cleaned_csv/id22-002_processed.csv",
  "/Users/can/Documents/Uni/Thesis/Data/Xsens/cleaned_csv/id22-003_processed.csv",
  "/Users/can/Documents/Uni/Thesis/Data/Xsens/cleaned_csv/id22-004_processed.csv",
  "/Users/can/Documents/Uni/Thesis/Data/Xsens/cleaned_csv/id22-005_processed.csv"
)
# Load and merge all files
id22_all <- do.call(rbind, lapply(file_list, read.csv))

#2.1 check if merged correctly, there should be around 48 times the markers “27” & “30” per block

# Make sure 'Marker Text' is numeric
id22_all$`Marker.Text` <- as.numeric(id22_all$`Marker.Text`)

# Count 27s per block
count_27_per_block <- id22_all %>%
  group_by(Block) %>%
  summarise(count_27 = sum(`Marker.Text` == 27, na.rm = TRUE))

# Count 30s per block
count_30_per_block <- id22_all %>%
  group_by(Block) %>%
  summarise(count_30 = sum(`Marker.Text` == 30, na.rm = TRUE))

# View result
print(count_27_per_block)
# A tibble: 5 × 2
  Block count_27
  <int>    <int>
1     1       48
2     2       48
3     3       44
4     4       48
5     5       47
print(count_30_per_block)
# A tibble: 5 × 2
  Block count_30
  <int>    <int>
1     1       48
2     2       48
3     3       44
4     4       48
5     5       47

#2.1.1 check if marker amounts are roughly right, will be further processed later

# Count each unique value in "Marker Text" per Block
marker_counts_per_block <- id22_all %>%
  group_by(Block, `Marker.Text`) %>%
  summarise(count = n(), .groups = "drop")

# View result
print(marker_counts_per_block)
# A tibble: 88 × 3
   Block Marker.Text count
   <int>       <dbl> <int>
 1     1           5   104
 2     1           6   104
 3     1           7    52
 4     1           8    52
 5     1          14    48
 6     1          15    94
 7     1          16    48
 8     1          17    98
 9     1          24     4
10     1          25     5
# ℹ 78 more rows

#3. save to further process any time

# Define your output path
output_folder <- "/Users/can/Documents/Uni/Thesis/Data/Xsens/cleaned_csv/merged"  # adjust as needed
output_file <- file.path(output_folder, "id22-all.csv")

# Save the CSV
write.csv(id22_all, output_file, row.names = FALSE)

#4. add trialnumbers and load file if necessary

id22_all <- read.csv("/Users/can/Documents/Uni/Thesis/Data/Xsens/cleaned_csv/merged/id22-all.csv")

# Ensure Marker.Text is numeric (again, just in case)
id22_all$`Marker.Text` <- as.numeric(id22_all$`Marker.Text`)

# Add trial numbers that increment at Marker.Text == 27 and reset for each block
id22_all <- id22_all %>%
  group_by(Block) %>%
  mutate(
    trial_start = ifelse(`Marker.Text` == 27, 1, NA),        # Flag where trial starts
    trial = cumsum(!is.na(trial_start)),                     # Count up at each 27
    trial = ifelse(trial == 0, NA, trial)                    # Replace 0 with NA if needed
  ) %>%
  fill(trial, .direction = "down") %>%                       # Fill trial number forward
  ungroup() %>%
  select(-trial_start)                                       # Optional: remove helper column

#5.finding out which trials are to be excluded due to missing datapoints. #first it will be looked at if the amount of data deviates for a trial within a block too much #additionally the amount of steps per trial are checked

# Step 1: Count number of rows (data points) per trial
trial_lengths <- id22_all %>%
  group_by(Block, trial) %>%
  summarise(
    n_rows = n(),
    .groups = "drop"
  )

# Step 2: Identify outlier trials based on row counts
trial_stats <- trial_lengths %>%
  group_by(Block) %>%
  mutate(
    median_rows = median(n_rows, na.rm = TRUE),
    iqr_rows = IQR(n_rows, na.rm = TRUE),
    lower_bound = median_rows - 1.5 * iqr_rows,
    upper_bound = median_rows + 1.5 * iqr_rows,
    is_row_outlier = n_rows < lower_bound | n_rows > upper_bound
  ) %>%
  ungroup()

# Step 3: Count marker 5–8 occurrences per trial
marker_counts <- id22_all %>%
  filter(`Marker.Text` %in% 5:8) %>%
  group_by(Block, trial, `Marker.Text`) %>%
  summarise(count = n(), .groups = "drop") %>%
  pivot_wider(
    names_from = `Marker.Text`,
    names_prefix = "marker_",
    values_from = count,
    values_fill = 0
  ) %>%
  mutate(marker_total = marker_5 + marker_6 + marker_7 + marker_8)

# Step 4: Merge row stats and marker counts into one summary table
trial_quality_summary <- trial_stats %>%
  left_join(marker_counts, by = c("Block", "trial"))

# Step 5: (Optional) View only suspicious trials
flagged_trials <- trial_quality_summary %>%
  filter(is_row_outlier | marker_total != 48)  # assumes 48 presses expected per trial

# Output full and flagged summaries
print(trial_quality_summary)
# A tibble: 240 × 13
   Block trial n_rows median_rows iqr_rows lower_bound upper_bound
   <int> <int>  <int>       <dbl>    <dbl>       <dbl>       <dbl>
 1     1     1   1419         899      204         593        1205
 2     1     2   1145         899      204         593        1205
 3     1     3   1054         899      204         593        1205
 4     1     4   1064         899      204         593        1205
 5     1     5   1270         899      204         593        1205
 6     1     6   1011         899      204         593        1205
 7     1     7    998         899      204         593        1205
 8     1     8   1003         899      204         593        1205
 9     1     9   1008         899      204         593        1205
10     1    10   1633         899      204         593        1205
# ℹ 230 more rows
# ℹ 6 more variables: is_row_outlier <lgl>, marker_5 <int>, marker_6 <int>,
#   marker_7 <int>, marker_8 <int>, marker_total <int>
print(flagged_trials)
# A tibble: 238 × 13
   Block trial n_rows median_rows iqr_rows lower_bound upper_bound
   <int> <int>  <int>       <dbl>    <dbl>       <dbl>       <dbl>
 1     1     1   1419         899      204         593        1205
 2     1     2   1145         899      204         593        1205
 3     1     3   1054         899      204         593        1205
 4     1     4   1064         899      204         593        1205
 5     1     5   1270         899      204         593        1205
 6     1     6   1011         899      204         593        1205
 7     1     7    998         899      204         593        1205
 8     1     8   1003         899      204         593        1205
 9     1     9   1008         899      204         593        1205
10     1    10   1633         899      204         593        1205
# ℹ 228 more rows
# ℹ 6 more variables: is_row_outlier <lgl>, marker_5 <int>, marker_6 <int>,
#   marker_7 <int>, marker_8 <int>, marker_total <int>

merging with e-prime data

df_lc <- read.csv("/Users/can/Documents/Uni/Thesis/Data/E-Prime/df_lc.csv")

#important to adjust step 3

# Step 2: Create table of bad trials
bad_trials <- df_lc %>%
  filter(trial.acc == 0) %>%
  select(subject, session, trial) %>%
  mutate(bad_trial = TRUE)

# Step 3: Add subject column to id22_all if needed (assumes 1 subject here, e.g., subject 3)
# Adjust this line as needed if you're processing per participant
id22_all$subject <- 22  # Replace with actual subject number if known

# Step 4: Join to mark bad trials
id22_all <- id22_all %>%
  left_join(bad_trials, by = c("subject", "Block" = "session", "trial")) %>%
  mutate(bad_trial = ifelse(is.na(bad_trial), FALSE, bad_trial))

# View a few rows
head(id22_all)
# A tibble: 6 × 17
  Frame CoM.pos.x CoM.pos.y CoM.pos.z CoM.vel.x CoM.vel.y  CoM.vel.z CoM.acc.x
  <dbl>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl>      <dbl>     <dbl>
1     0    -0.779     0.580     0.966 -0.00229   -0.00205 -0.00252     0.00214
2     1    -0.779     0.580     0.966  0.000161  -0.00304 -0.00206     0.0838 
3     2    -0.779     0.580     0.966  0.00236   -0.00394 -0.00161     0.106  
4     3    -0.779     0.579     0.966  0.00421   -0.00462 -0.00112     0.119  
5     4    -0.779     0.579     0.966  0.00529   -0.00471 -0.000598    0.110  
6     5    -0.779     0.579     0.966  0.00600   -0.00469 -0.0000529   0.111  
# ℹ 9 more variables: CoM.acc.y <dbl>, CoM.acc.z <dbl>, Marker.Text <dbl>,
#   Block <int>, ms <dbl>, s <dbl>, trial <int>, subject <dbl>, bad_trial <lgl>
# Step 1: Fix trial numbering in df_lc
df_lc <- df_lc %>%
  group_by(subject, session) %>%
  mutate(trial_within_block = row_number()) %>%
  ungroup()

# Step 2: Prepare list of bad trials
bad_trials <- df_lc %>%
  filter(trial.acc == 0) %>%
  select(subject, session, trial_within_block) %>%
  rename(trial = trial_within_block) %>%
  mutate(temp_flag = TRUE)

# Step 3: Add subject to trial_quality_summary only if not already there
if (!"subject" %in% colnames(trial_quality_summary)) {
  trial_subject_info <- id22_all %>%
    select(Block, trial, subject) %>%
    distinct()
  
  trial_quality_summary <- trial_quality_summary %>%
    left_join(trial_subject_info, by = c("Block", "trial"))
}

# Step 4: Join bad trial info using correct keys
trial_quality_summary <- trial_quality_summary %>%
  left_join(bad_trials, by = c("subject", "Block" = "session", "trial")) %>%
  mutate(bad_trial = ifelse(is.na(temp_flag), FALSE, TRUE)) %>%
  select(-temp_flag)

# Step 5: Clean up extra subject columns (if created)
trial_quality_summary <- trial_quality_summary %>%
  select(-matches("subject\\.[xy]$"))

# ✅ Done
head(trial_quality_summary)
# A tibble: 6 × 15
  Block trial n_rows median_rows iqr_rows lower_bound upper_bound is_row_outlier
  <int> <int>  <int>       <dbl>    <dbl>       <dbl>       <dbl> <lgl>         
1     1     1   1419         899      204         593        1205 TRUE          
2     1     2   1145         899      204         593        1205 FALSE         
3     1     3   1054         899      204         593        1205 FALSE         
4     1     4   1064         899      204         593        1205 FALSE         
5     1     5   1270         899      204         593        1205 TRUE          
6     1     6   1011         899      204         593        1205 FALSE         
# ℹ 7 more variables: marker_5 <int>, marker_6 <int>, marker_7 <int>,
#   marker_8 <int>, marker_total <int>, subject <dbl>, bad_trial <lgl>
# Step 1: Renumber trials within blocks in df_lc and keep accuracy & timing
trial_metrics <- df_lc %>%
  group_by(subject, session) %>%
  mutate(trial = row_number()) %>%
  ungroup() %>%
  select(subject, session, trial, trial.acc, trial.RT, trial.RTS) %>%
  mutate(bad_trial = trial.acc == 0)

# Step 2: Add subject info to trial_quality_summary (if not already present)
trial_subject_info <- id22_all %>%
  select(Block, trial, subject) %>%
  distinct()

trial_quality_summary <- trial_quality_summary %>%
  select(-matches("subject\\.?[xy]?$")) %>%
  left_join(trial_subject_info, by = c("Block", "trial"))

# Step 3: Join accuracy + RT info from df_lc
trial_quality_summary <- trial_quality_summary %>%
  left_join(trial_metrics, by = c("subject", "Block" = "session", "trial"))



# ✅ Done
head(trial_quality_summary)
# A tibble: 6 × 19
  Block trial n_rows median_rows iqr_rows lower_bound upper_bound is_row_outlier
  <int> <int>  <int>       <dbl>    <dbl>       <dbl>       <dbl> <lgl>         
1     1     1   1419         899      204         593        1205 TRUE          
2     1     2   1145         899      204         593        1205 FALSE         
3     1     3   1054         899      204         593        1205 FALSE         
4     1     4   1064         899      204         593        1205 FALSE         
5     1     5   1270         899      204         593        1205 TRUE          
6     1     6   1011         899      204         593        1205 FALSE         
# ℹ 11 more variables: marker_5 <int>, marker_6 <int>, marker_7 <int>,
#   marker_8 <int>, marker_total <int>, bad_trial.x <lgl>, subject <dbl>,
#   trial.acc <int>, trial.RT <dbl>, trial.RTS <dbl>, bad_trial.y <lgl>
wrong_trials_summary <- trial_quality_summary %>%
  filter(bad_trial.x == TRUE) %>%
  group_by(subject, Block) %>%
  summarise(
    bad_trials = paste(sort(trial), collapse = ", "),
    n_bad_trials = n(),
    .groups = "drop"
  )

# View the result
print(wrong_trials_summary)
# A tibble: 3 × 4
  subject Block bad_trials                                          n_bad_trials
    <dbl> <int> <chr>                                                      <int>
1      22     1 1, 2, 4, 12, 41                                                5
2      22     2 1, 2, 3, 4, 5, 7, 11, 14, 15, 16, 17, 18, 19, 20, …           31
3      22     3 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16,…           41
# Step 1: Get trial starts (marker 27)
trial_starts <- id22_all %>%
  filter(Marker.Text == 27) %>%
  select(subject, Block, trial, ms) %>%
  rename(start_trial = trial, start_ms = ms)

# Step 2: Assign 1100ms before each trial start to that trial (even if it was previously NA or deleted)
# Add a unique row ID for tracking
id22_all <- id22_all %>%
  mutate(row_id = row_number())

# Get all rows that fall within 1100ms before any trial start
pre_trial_buffer <- id22_all %>%
  filter(!is.na(ms)) %>%
  inner_join(trial_starts, by = c("subject", "Block")) %>%
  filter(ms >= (start_ms - 1100) & ms < start_ms) %>%
  select(row_id, reassigned_trial = start_trial)
Warning in inner_join(., trial_starts, by = c("subject", "Block")): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
# Step 3: Apply the reassigned trial numbers to the original data
id22_all <- id22_all %>%
  left_join(pre_trial_buffer, by = "row_id") %>%
  mutate(
    trial = ifelse(!is.na(reassigned_trial), reassigned_trial, trial)
  ) %>%
  select(-row_id, -reassigned_trial)


# Step 4: Define trials to remove
trials_to_remove_list <- list(
  "1" = c(1, 2, 4, 12, 41, 10, 15, 26, 45, 48),
  "2" = c(1, 2, 3, 4, 5, 7, 11, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 40, 45, 48, 13, 41, 47, 48),
  "3" = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44)
)

trials_to_remove_df <- bind_rows(
  lapply(names(trials_to_remove_list), function(block) {
    data.frame(Block = as.integer(block), trial = trials_to_remove_list[[block]])
  })
)

# Step 5: Filter out unwanted trials (excluding reassigned pre-trial buffers)
id22_clean <- id22_all %>%
  left_join(trials_to_remove_df %>% mutate(to_remove = TRUE), by = c("Block", "trial")) %>%
  filter(is.na(to_remove)) %>%
  select(-to_remove)
Warning in left_join(., trials_to_remove_df %>% mutate(to_remove = TRUE), : Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 148514 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
# Define which marker values count as steps
step_markers <- c(14, 15, 16, 17)

# Add step_number column
id22_clean <- id22_clean %>%
  left_join(
    trial_quality_summary %>%
      select(subject, Block, trial, trial.RT, trial.RTS),
    by = c("subject", "Block", "trial")
  )
id22_clean <- id22_clean %>%
  select(-matches("^trial\\.[xy]$"), -bad_trial)
# Define output folder
output_folder <- "/Users/can/Documents/Uni/Thesis/Data/Xsens/cleaned_csv/merged/Cleaned"

# Define file name and full path
output_file <- file.path(output_folder, "id22_clean.csv")

# Write the file
write.csv(id22_clean, output_file, row.names = FALSE)