library(readxl)
library(dplyr)
library(tidyverse)
library(openxlsx)
library(stringr)
library(purrr)
library(janitor)

Configuration: Define directories and file paths

#CONFIGURATION: SET TARGET PARTICIPANT ID
PARTICIPANT_ID <- 2

base_dir <- "/Volumes/PHILIPS UFD"
xsens_exec_dir <- file.path("/Users/vivi/UT/Thesis/Data/Xsens/Participants", "Movement Execution Period", paste0("ID", PARTICIPANT_ID))
eprime_clean_file <- file.path("/Users/vivi/UT/Thesis/Data/E-Prime", "rt_cleaned_merged.xlsx")

if (!dir.exists(xsens_exec_dir)) {
  dir.create(xsens_exec_dir, recursive = TRUE)
  message(paste("Created Xsens output directory:", xsens_exec_dir))
}

Xsens execution period preprocessing

message(paste("Processing movement execution data for Participant ID:", PARTICIPANT_ID))

process_participant_exec <- function(pid) {
  
  message(paste("Starting processing for Subject", pid, "..."))
  
  # 1. Detect and filter files for the current participant 
  # Using recursive=TRUE to search the base_dir and all subdirectories
  all_files <- list.files(base_dir, pattern = sprintf("ID%s-[0-9]+\\.xlsx$", pid), 
                          recursive = TRUE, full.names = TRUE)
  
  # Filter files to ensure we only get ID{pid}-{Block}.xlsx files
  #participant_files <- all_files[grepl(sprintf("/ID%s-[0-9]+\\.xlsx$", pid), all_files)]
  participant_files <- all_files[grepl(sprintf("ID%s\\b-[0-9]+\\.xlsx$", pid), all_files)]  
  
  # Extract block numbers from filenames
  block_numbers <- as.numeric(str_extract(participant_files, "(?<=ID[0-9]{1,2}-)[0-9]+"))

  if (length(participant_files) == 0) {
    warning(sprintf("No files found for Subject %s in %s. Skipping.", pid, base_dir))
    return(NULL)
  }
  
  # 2. Read, Clean, and Combine Data 
  combined_data_list <- purrr::map2(
    participant_files, block_numbers,
    ~{
      block_num <- .y
      file_path <- .x
      
      # Use tryCatch for robust file reading
      df_result <- tryCatch({
        com <- read_excel(file_path, sheet = "Center of Mass") %>% clean_names()
        markers <- read_excel(file_path, sheet = "Markers") %>% clean_names()
        
        merged_df <- left_join(com, markers, by = "frame") %>%
          # CRITICAL FIX: Ensure Subject and Block are defined before returning
          mutate(Subject = as.integer(pid), Block = block_num)
        
        # Check for marker_text column and rename if necessary
        if (!"marker_text" %in% colnames(merged_df)) {
          if ("ns1_marker" %in% colnames(merged_df)) {
              merged_df <- merged_df %>% rename(marker_text = ns1_marker)
          } else {
              warning(sprintf("Marker column missing in Block %s of Subject %s. Skipping.", block_num, pid))
              return(NULL)
          }
        }
        return(merged_df)
      }, error = function(e) {
        warning(sprintf("Error reading file %s: %s", basename(file_path), e$message))
        return(NULL)
      })
      
      return(df_result)
    }
  )
  
  # Remove NULL entries and combine the list into a single dataframe
  combined_data <- bind_rows(combined_data_list)
  
  if (nrow(combined_data) == 0) {
    message(sprintf("No valid data combined for Subject %s. Returning NULL.", pid))
    return(NULL)
  }
  
  # Numeric conversion
  combined_data <- combined_data %>%
    mutate(marker_text = as.numeric(trimws(as.character(marker_text))))
  
  # 3. Trial Numbering
  combined_data_numbered <- combined_data %>%
    group_by(Subject, Block) %>%
    mutate(
      # Use tolerance to check for 27
      is_go = (abs(marker_text - 27) < 0.001), 
      # Handle NA in 'is_go' and ensure 'Trial' is an integer
      Trial = as.integer(cumsum(replace_na(is_go, FALSE))),
      row_index = row_number()
    ) %>%
  ungroup()

  # Apply filter
  combined_data_numbered <- combined_data_numbered %>%
    filter(Trial > 0)
  
  # 4. Filter to Movement Execution Period (150 frames AFTER GO)

  # Calculate the length of each trial (needed for the data point check)
  trial_lengths <- combined_data_numbered %>%
    group_by(Subject, Block, Trial) %>%
    summarise(
      n_frames = n(),
      max_row_index = max(row_index, na.rm = TRUE),
      .groups = 'drop'
    )
  
  # Find the global index of every GO marker (within the block/subject grouping)
  go_indices_df <- combined_data_numbered %>%
    filter(is_go) %>%
    # Select the index *of the row that has the GO marker*
    select(Subject, Block, Trial, go_index = row_index)
  
  # Calculate the start and end row indices for the 150-frame window
 exec_windows <- go_indices_df %>%
   left_join(trial_lengths, by = c("Subject", "Block", "Trial")) %>%
    mutate(
      start_index = go_index - 1,
      end_index   = go_index + 149,    # 150 rows after GO
      is_valid = end_index <= max_row_index
    ) %>%
    # Use row_number() to generate a consistent trial index for the join later
    select(Subject, Block, Trial, start_index, end_index, max_row_index, is_valid)
  
  # Check if every trial has at least 150 data points available
  invalid_trials <- exec_windows %>%
    filter(is_valid == FALSE) 

  if (nrow(invalid_trials) > 0) {
    # Report which specific trials are too short
    # max_row_index and end_index are now available in invalid_trials
    short_trial_list <- invalid_trials %>%
      mutate(info = sprintf("Block %d, Trial %d (Ends at frame %d, requires up to frame %d)", Block, Trial, max_row_index, end_index)) %>%
      pull(info)

    message(sprintf("⚠️ WARNING: Subject %s has %d trial(s) too short to extract 150 frames after the first step:", pid, nrow(invalid_trials)))
    message(paste(short_trial_list, collapse = "\n"))

    # Filter out the invalid trials for the rest of the processing
    exec_windows <- exec_windows %>% filter(is_valid == TRUE)

    if (nrow(exec_windows) == 0) {
      message(sprintf("❌ ERROR: All trials for Subject %s were invalid. Skipping subject.", pid))
      return(NULL)
    }
  }
  
  # Now, join the main data back to these index windows and filter
  if (nrow(exec_windows) > 0) { # <--- ADD THIS CHECK
    exec_data <- purrr::map_dfr(1:nrow(exec_windows), ~ {
      window <- exec_windows[.x, ]
      combined_data_numbered %>%
      # Filter to the specific index range for this trial
        filter(
          Subject == window$Subject,
          Block == window$Block,
          row_index > window$start_index, 
          row_index <= window$end_index
        ) %>%
        # Add the trial number and block number from the window (crucial if data is missing)
        mutate(
          Trial = window$Trial,
          Block = window$Block,
          # Recalculate the row index within the 150-frame execution period
          row_in_trial = row_number() 
        )
    })
  } else {
    # If no exec windows found, set exec_data to an empty dataframe
    message(sprintf("WARNING: No valid execution windows found for Subject %s.", pid))
    exec_data <- tibble() # Or a structure matching what it should contain
  }

  # DEBUG CHECK: Ensure we have data
  if (nrow(exec_data) == 0) {
    message(sprintf("ERROR: Execution period filter produced zero rows for Subject %s.", pid))
    if (any(exec_windows$start_index <= 0)) {
    message("NOTE: Found Trials where the GO marker occurs too late in the data block.")
    }
    return(NULL)
  }
  
  # 5. Center-of-Mass Calculations (RMS is calculated per trial time series)
  final_df <- exec_data %>%
    # 5a. Calculate Time-Series Vector Magnitudes
    mutate(
      COM_pos = sqrt(co_m_pos_x^2 + co_m_pos_y^2 + co_m_pos_z^2),
      COM_vel = sqrt(co_m_vel_x^2 + co_m_vel_y^2 + co_m_vel_z^2),
      COM_acc = sqrt(co_m_acc_x^2 + co_m_acc_y^2 + co_m_acc_z^2)
    ) %>%
    group_by(Subject, Block, Trial) %>%
    mutate(
      # 5b. Calculate Trial-Level RMS Summary (RMS values are repeated for all rows in the trial)
      RMS_acc = sqrt(mean(COM_acc^2, na.rm = TRUE)),
      RMS_vel = sqrt(mean(COM_vel^2, na.rm = TRUE)),
      RMS_pos = sqrt(mean(COM_pos^2, na.rm = TRUE)),
      .groups = 'drop'
    ) %>%
    # Select final columns, including the new magnitude columns
    select(
      Subject, Block, Trial, row_in_trial, time_code = "frame", Marker = marker_text, 
      co_m_acc_x, co_m_acc_y, co_m_acc_z, # Keep component columns for Lyapunov
      COM_pos, COM_vel, COM_acc, 
      RMS_acc, RMS_vel, RMS_pos # Keep the trial-level RMS columns
    )
    
  # 6. Write to Files
  # File 1: Time Series Data (Movement Execution Period)
  out_file_ts <- file.path(xsens_exec_dir, sprintf("ID%s_execution_timeseries.xlsx", pid))
  write.xlsx(final_df, out_file_ts)
  message(paste("Successfully wrote time series data to:", out_file_ts))
  
  message(sprintf("Successfully processed and saved all files for Subject %s.", pid))
  return(final_df) # Return the processed data frame
}

# Execution
processed_exec_data <- process_participant_exec(PARTICIPANT_ID)

Xsens and E-Prime Merge

message(paste("Starting merge for Participant ID:", PARTICIPANT_ID))

execution_list <- list.files(
  path       = xsens_exec_dir, # Look in the specific ID directory
  pattern    = sprintf("^ID%s_execution_timeseries\\.xlsx$", PARTICIPANT_ID),
  full.names = TRUE
)

if (length(execution_list) == 0) {
  stop(sprintf("Execution time-series file for ID %s not found in %s.", PARTICIPANT_ID, xsens_exec_dir))
}

# Read and format the Xsens time-series data
execution_df <- read_excel(execution_list[1]) %>%
  mutate(
    # Convert merge keys to factors for consistency
    Subject = as.factor(Subject),
    Block   = factor(Block, levels = as.character(1:5)), # Assuming max 5 blocks
    Trial   = factor(Trial, levels = as.character(1:48)), # Assuming max 48 trials
    time_code = as.character(time_code), 
    # Ensure all CoM columns are numeric
    across(starts_with("COM_"), as.numeric),
    # Ensure all RMS columns are numeric
    across(starts_with("RMS_"), as.numeric)
  )

# 2. Import and simplify RT data (E-Prime)
rt_data <- read_excel(eprime_clean_file) %>%
    # Filter only for the current participant
  filter(subject == PARTICIPANT_ID) %>%
  filter(!is.na(sequence)) %>%
  filter(!is.na(trial.RTS)) %>%
  mutate(
    Subject = as.factor(subject),
    Block   = factor(session, levels = as.character(1:5)),
    Trial   = factor(trial, levels = as.character(1:48)),
    Sequence = as.factor(sequence),
    Trial_Acc = as.numeric(trial.acc)
  ) %>%
  # Keep one row per trial, preserving all sequence info columns (assuming 'Sequence' is in the RT data)
  distinct(Subject, Block, Trial, .keep_all = TRUE) %>%
  
  # Select the required merge keys PLUS the Sequence column
  select(Subject, Block, Trial, Sequence, trial.RTS, Trial_Acc)

# 3. Merge Xsens time-series with RT sequence data
# This merges all the Xsens time points with the single row of sequence data for that trial.
# This results in the final, merged sequence data needed for Lyapunov analysis.
execution_merged <- execution_df %>%
  left_join(rt_data, by = c("Subject", "Block", "Trial")) %>%
  filter(!is.na(Sequence)) %>%
  # Select final columns, including the time series data and the trial-level RMS/RT/Sequence data
  # The analysis script (load_data_fix.R) relies on these being present:
  select(
    Subject, Block, Trial, Sequence, row_in_trial, time_code, Marker, 
    trial.RTS, Trial_Acc, COM_acc,
    RMS_acc, RMS_vel, RMS_pos
  )
  
# 4. Write out the merged file
out_file_merged <- file.path(xsens_exec_dir, sprintf("ID%s_execution_merged.xlsx", PARTICIPANT_ID))

write.xlsx(execution_merged, out_file_merged)

message(paste("Successfully merged Xsens time-series with RT data."))
message(paste("File saved to:", out_file_merged))