Replay data read_in

Helpers for reading in replay data:

read_replay_expand reads a replay file and expands all the metadata in the Details column into separate columns.

read_replay reads a replay file without expanding the metadata in the Details column.

# Import required libraries
library(readr)
library(jsonlite)
library(tidyr)
library(tidyverse)
library(parallel)
library(dplyr)

# Read a replay file and spread the JSON stored in its 'Details' column over
# one column per JSON field.
#
# file_path: path to a tab-separated replay file. Every column is read as
#            character.
# Returns the table with 'Details' replaced by the unnested columns.
read_replay_expand <- function(file_path) {
  replay <- read_tsv(file_path, col_types = cols(.default = "c"))

  # Turn one JSON string into a plain R list so tidyr can unnest it
  parse_details <- function(json_text) {
    as.list(fromJSON(json_text, simplifyVector = TRUE))
  }
  replay$Details <- lapply(replay$Details, parse_details)

  # One new column per field found in the parsed 'Details' lists
  unnest_wider(replay, Details)
}

# Read a replay file as-is, leaving the 'Details' column as raw text.
#
# file_path: path to a tab-separated replay file. Every column is read as
#            character.
# Returns the table produced by read_tsv().
read_replay <- function(file_path) {
  read_tsv(file_path, col_types = cols(.default = "c"))
}

# Pick the single most recent row of a data frame, optionally restricted to
# rows at or before a cutoff time.
#
# df:          data frame with a 'Timestamp' column.
# reward_time: cutoff timestamp; when it is NA no cutoff is applied and the
#              latest row overall is chosen.
# Returns a data frame with at most one row.
get_closest_or_latest <- function(df, reward_time) {
  candidates <- df

  # Only rows not later than the cutoff are eligible when a cutoff is given
  if (!is.na(reward_time)) {
    candidates <- filter(candidates, Timestamp <= reward_time)
  }

  # Newest first, then keep the top row
  newest_first <- arrange(candidates, desc(Timestamp))
  slice(newest_first, 1)
}

# Extract pilot information (arm, reward and context values) from a replay
# file, one wide row per combination of the non-pivoted columns.
#
# file_path: path to a tab-separated replay file with (at least) the columns
#            Timestamp, Action, Key, Value, Details, Identifier and
#            `MOOClet ID`.
#
# For every MOOClet ID the function records the latest 'Arm' row, the latest
# 'Reward' row, and, for each context key, the latest row at or before the
# arm time (or the latest row overall when there is no arm). The context rows
# are then pivoted so that each context key becomes a column holding a JSON
# string of its Timestamp / Value / Details.
#
# Rewards timestamped before the latest arm are additionally emitted as
# stand-alone rows that carry no arm columns.
#
# Returns the pivoted data frame with a 'Date' column (date of arm_time)
# placed after `MOOClet ID`. When no context rows were produced there is
# nothing to pivot, and the collected rows (possibly none) are returned
# un-pivoted instead of raising an error.
pilot_info_cTS <- function(file_path) {
  # Read data from the replay file, everything as character
  df <- read_tsv(file_path, col_types = cols(.default = "c"))

  # Convert 'Timestamp' column to date-time format
  df$Timestamp <-
    as.POSIXct(df$Timestamp, format = "%Y-%m-%dT%H:%M:%S", tz = "UTC")

  # Unique MOOClet IDs, and the keys of 'Context' rows that have a MOOClet ID
  mooclet_ids <-
    unique((df %>% filter(!is.na(`MOOClet ID`)))$`MOOClet ID`)
  context_rows <- df[grepl('Context', df$Action), ] %>%
    filter(is.na(`MOOClet ID`) != TRUE)
  context_keys <- unique(context_rows$Key)

  # Rows are collected here and bound once after the loop
  pieces <- list()

  for (mooclet_id in mooclet_ids) {
    # Subset the data for the current MOOClet ID
    dt <- df %>% filter(`MOOClet ID` == mooclet_id)

    reward <- NA
    reward_time <- NA
    reward_key <- NA
    arm <- NA
    arm_time <- NA
    arm_key <- NA

    # Latest 'Arm' action, if any
    if (any(grepl('Arm', dt$Action))) {
      arm_row <- dt[grepl('Arm', dt$Action), ] %>%
        arrange(desc(Timestamp)) %>%
        slice(1)

      arm_time <- arm_row$Timestamp
      arm_key <- arm_row$Key
      arm <- arm_row$Value
    }

    # 'Reward' actions, newest first
    if (any(grepl('Reward', dt$Action))) {
      reward_rows <- dt[grepl('Reward', dt$Action), ] %>%
        arrange(desc(Timestamp))

      # The newest reward that precedes the arm is kept as a stand-alone row
      # without arm information.
      # todo, filter on the descending order of reward
      early_rewards <- reward_rows %>% filter(Timestamp < arm_time)
      if (nrow(early_rewards) > 0) {
        early_reward <- early_rewards %>% slice(1)
        pieces[[length(pieces) + 1]] <- data.frame(
          Identifier = early_reward$Identifier,
          `MOOClet ID` = early_reward$`MOOClet ID`,
          reward_time = early_reward$Timestamp,
          reward_key = early_reward$Key,
          reward = early_reward$Value,
          check.names = FALSE
        )
      }

      # The newest reward overall is attached to the context rows below.
      # NOTE(review): this reward may itself be earlier than the arm.
      reward_row <- reward_rows %>% slice(1)
      reward_time <- reward_row$Timestamp
      reward_key <- reward_row$Key
      reward <- reward_row$Value
    }

    for (key in context_keys) {
      context_dt <- dt %>% filter(Key == key)

      # If there are any rows for the current context key
      if (nrow(context_dt) > 0) {
        # Latest row at or before 'arm_time' (latest overall if arm_time is NA)
        context_dt <- get_closest_or_latest(context_dt, arm_time)

        # Add reward and arm information to the row
        context_dt$reward <- reward
        context_dt$reward_time <- reward_time
        context_dt$reward_key <- reward_key
        context_dt$arm <- arm
        context_dt$arm_time <- arm_time
        context_dt$arm_key <- arm_key

        # Combine Timestamp, Value, Details into a single JSON string in a
        # new column 'context_info'
        context_dt <- context_dt %>%
          mutate(context_info = as.character(toJSON(
            list(
              Timestamp = Timestamp,
              Value = Value,
              Details = Details
            )
          )))

        # Drop the original 'Timestamp', 'Value', 'Details' columns
        context_dt <-
          context_dt %>% select(-c(Timestamp, Value, Details))

        pieces[[length(pieces) + 1]] <- context_dt
      }
    }
  }

  # Seed with an empty data.frame so the combined result has the same class
  # as when rows were appended one at a time to data.frame()
  result_df <- bind_rows(c(list(data.frame()), pieces))

  # 'Key' and 'context_info' only exist once a context row was collected;
  # without them pivot_wider() has nothing to work on.
  if (!all(c("Key", "context_info") %in% names(result_df))) {
    return(result_df)
  }

  # Pivot once, after all MOOClet IDs are processed, to get one column per
  # context key; then add the arm date next to the MOOClet ID
  result_df %>%
    pivot_wider(names_from = Key, values_from = context_info) %>%
    select(-Action) %>%
    #filter(!is.na(arm)) %>%
    mutate(Date = as.Date(arm_time, tz = "UTC")) %>%
    relocate(Date, .after = `MOOClet ID`)
}

# Load one day of data from its .zip file.
#
# zip_path: path to a .zip archive of replay files.
#
# The archive is extracted with unzip() (into the current working directory,
# its default), every extracted file is run through pilot_info_cTS(), and the
# per-file results are stacked into one data frame.
load_contextual_day <- function(zip_path) {
  # unzip() returns the paths of the files it extracted
  extracted_files <- unzip(zip_path)

  per_file <- lapply(extracted_files, pilot_info_cTS)

  # Start from an empty data.frame, then stack every file's rows
  combined <- bind_rows(c(list(data.frame()), per_file))

  # TODO: sort 'combined' once the timestamp column to sort by is settled
  combined
}


# load everything in a path, and cross map reward:

# First we need a helper for updating rewards: it takes a reward row from the resulting dataframe that has no arm assignment and matches it with the nearest earlier arm record that has the same MOOClet ID and Identifier.

# Attach a stand-alone reward to the most recent earlier arm assignment of
# the same MOOClet ID and Identifier.
#
# df:         data frame with the columns `MOOClet ID`, Identifier, arm_time,
#             reward, reward_time and reward_key.
# reward_row: one-row data frame with `MOOClet ID`, Identifier, reward_time,
#             reward_key and reward.
#
# Among the rows of 'df' with the same MOOClet ID and Identifier whose
# arm_time is strictly before the reward time, the one(s) with the latest
# arm_time are selected. Their reward, reward_time and reward_key are filled
# in only where the reward is still NA, so existing rewards are never
# overwritten.
#
# Returns 'df', updated when a matching row was found and unchanged otherwise.
update_reward <- function(df, reward_row) {
  id <- reward_row[["MOOClet ID"]]
  person_id <- reward_row[["Identifier"]]
  reward_time1 <- reward_row[["reward_time"]]
  reward_key1 <- reward_row[["reward_key"]]
  reward1 <- reward_row[["reward"]]

  arm_times <- df[["arm_time"]]

  # which() drops NA comparisons, so rows without an arm time never qualify
  candidates <- which(
    df[["MOOClet ID"]] == id &
      df[["Identifier"]] == person_id &
      arm_times < reward_time1
  )
  if (length(candidates) == 0) {
    return(df)
  }

  # Locate the target by the matched arm_time itself, so only the nearest
  # earlier arm assignment is touched (not every earlier row)
  latest_arm_time <- max(arm_times[candidates])
  targets <- candidates[arm_times[candidates] == latest_arm_time]

  # Only fill in rows that do not already have a reward
  targets <- targets[is.na(df[["reward"]][targets])]

  if (length(targets) > 0) {
    df[targets, "reward_time"] <- reward_time1
    df[targets, "reward_key"] <- reward_key1
    df[targets, "reward"] <- reward1
  }

  df
}


# Load every .zip archive in a directory, then match stand-alone rewards to
# earlier arm assignments.
#
# dir_path: directory containing one .zip archive per day of replay data.
#
# Each archive is loaded with load_contextual_day() and the results are
# stacked. Rows that carry a reward but no arm are then passed one by one to
# update_reward(). Finally only rows with an arm are kept.
#
# Returns the combined data frame restricted to rows with a non-NA arm; a
# zero-row data frame when nothing was loaded or no row carries an arm.
load_all_contextual_data <- function(dir_path) {
  # 'pattern' is a regular expression, not a glob: match names ending in .zip
  zip_files <-
    list.files(dir_path, pattern = "\\.zip$", full.names = TRUE)

  # Initialize a dataframe to store the final result
  final <- data.frame()

  # Loop through each .zip file
  for (zip_file in zip_files) {
    tidy_data <- load_contextual_day(zip_file)

    # Add the result to the final dataframe
    if (nrow(final) == 0) {
      final <- tidy_data
    } else {
      final <- bind_rows(final, tidy_data)
    }
  }

  # Without rows, or without the 'arm' / 'reward' columns, no row can have an
  # arm, so there is nothing to match or keep
  if (nrow(final) == 0 || !all(c("arm", "reward") %in% names(final))) {
    return(final[0, , drop = FALSE])
  }

  # Rewards that were recorded without an arm assignment
  reward_rows <-
    final %>% filter(is.na(reward) == FALSE & is.na(arm) == TRUE)

  # seq_len() yields no iterations when there are no such rewards
  for (i in seq_len(nrow(reward_rows))) {
    final <- update_reward(final, reward_rows[i, ])
  }

  final %>% filter(!is.na(arm))
}

# Demo: remember to put everything in the same folder and set the working directory to the current folder via Session -> Set Working Directory -> To Source File Location
#da <- pilot_info_cTS('R34-27566589__2023-06-20_2023-06-21.txt')

# demo for load_contextual_day

#db <- load_contextual_day('/Users/haochensong/Desktop/mooclet_replay_events/data/MHA_1.zip')

#demo for load_all_contextual_data

#dc <- load_all_contextual_data('/Users/haochensong/Desktop/mooclet_replay_events/data/')

#dd <- dc %>%filter(`MOOClet ID` == 586)