library(readxl)
library(dplyr)
library(tidyverse)
library(openxlsx)
library(stringr)
library(purrr)
library(janitor)
library(tidyr)

# Define directories and file paths

# Root folder of the E-Prime export. The processed output is written to the
# same folder the raw file is read from.
base_eprime_dir <- "/Users/vivi/UT/Thesis/Data/E-Prime"
eprime_processed_dir <- file.path(base_eprime_dir)
eprime_raw_file <- file.path(base_eprime_dir, "all_excluded2.csv")

# Create the output folder (including parents) if it is not there yet
if (!dir.exists(eprime_processed_dir)) {
  dir.create(eprime_processed_dir, recursive = TRUE)
  message(
    paste("Created E-Prime output directory:", eprime_processed_dir)
  )
}

# E-Prime Data Preprocessing

# Read the semicolon-delimited E-Prime export, then keep only the rows whose
# procedure column equals "responsprocedure"
A_B <- read.csv(file = eprime_raw_file, sep = ";")
A_B_R <- filter(A_B, procedure == "responsprocedure")

# Calculate trial numbers, accuracy, and RT.
# Result: one row per (subject, session, trial) with the trial's accuracy,
# mean RT (divided by 1000) and sequence length, restricted to trials with
# accuracy >= 0.8 and a non-missing RT.
eprime_df_cleaned <- A_B_R %>%
  # 1. Coerce the columns used below to numeric
  mutate(
    feedback.ACC = as.numeric(feedback.ACC),
    feedback.RT = as.numeric(feedback.RT),
    # Must be numeric for the max() taken per trial in step 3
    sub.trial.number = as.numeric(sub.trial.number)
  ) %>%

  # 2. Trial numbering: within each session/subject, every row with
  #    sub.trial.number == 1 is assumed to start a new trial, so the running
  #    count of such rows is the trial index. Relies on the rows being in
  #    presentation order.
  group_by(session, subject) %>%
  mutate(trial = cumsum(sub.trial.number == 1)) %>%
  ungroup() %>%

  # 3. Collapse to exactly one row per trial. Every expression here must be
  #    length 1; a per-row expression (e.g. case_when() on `session`) would
  #    make summarise() emit one row per sub-trial instead of one per trial.
  group_by(subject, session, trial) %>%
  summarise(
    # Proportion correct; NA responses count as incorrect (n() includes them)
    trial.acc = sum(feedback.ACC, na.rm = TRUE) / n(),
    trial.RT = mean(feedback.RT, na.rm = TRUE),
    # Largest sub-trial index in the trial; used as the sequence length for
    # sessions 4 and 5 below
    max_sub_trial = max(sub.trial.number),
    .groups = "drop"
  ) %>%

  # 4. Sequence length by session, computed on the already-summarised rows
  mutate(
    sequence = case_when(
      session == 1 ~ 6,
      session == 2 ~ 12,
      session == 3 ~ 18,
      # Sessions 4 and 5: length is the maximum sub.trial.number of the trial
      session %in% c(4, 5) ~ max_sub_trial,
      TRUE ~ NA_real_ # Should not happen for sessions 1-5
    ),
    # Convert RT to seconds (presumably recorded in milliseconds)
    trial.RTS = trial.RT / 1000
  ) %>%

  # 5. Drop (not just flag) trials below 80% accuracy or without a usable RT
  filter(trial.acc >= 0.8 & !is.na(trial.RTS)) %>%

  # Final cleanup and selection (also drops the helper max_sub_trial)
  select(subject, session, trial, sequence, trial.acc, trial.RTS)


# Debugging aid: a missing sequence length means a session value fell outside
# the session mapping (or the per-trial maximum was NA)
if (anyNA(eprime_df_cleaned$sequence)) {
  warning(
    "WARNING: Not all rows have a defined sequence length after processing. Check Session mapping."
  )
}

# Save the cleaned, merged E-Prime table (all subjects and sessions) as xlsx
out_file_eprime <- file.path(eprime_processed_dir, "rt_cleaned_merged.xlsx")
# openxlsx names this argument `rowNames`; `row.names` belongs to
# write.csv()/xlsx::write.xlsx() and is not an openxlsx parameter
write.xlsx(eprime_df_cleaned, file = out_file_eprime, rowNames = FALSE)

message(paste("E-Prime data cleaned and saved:", out_file_eprime))