# Import required libraries
library(readr)
library(jsonlite)
library(tidyr)
library(tidyverse)
library(parallel)
library(dplyr)
# Read a replay file and spread the JSON stored in 'Details' across columns.
#
# Args:
#   file_path: Path to a tab-separated replay file.
#
# Returns:
#   The file contents (all columns read as character) with the 'Details'
#   column replaced by one column per field of its parsed JSON.
read_replay_expand <- function(file_path) {
  replay <- read_tsv(file_path, col_types = cols(.default = "c"))

  # Each 'Details' cell holds a JSON string; parse it into a plain R list so
  # that unnest_wider() can turn the fields into columns.
  parse_details <- function(json_text) {
    as.list(fromJSON(json_text, simplifyVector = TRUE))
  }
  replay$Details <- lapply(replay$Details, parse_details)

  unnest_wider(replay, Details)
}
# Read a replay file as-is, leaving the 'Details' column as raw text.
#
# Args:
#   file_path: Path to a tab-separated replay file.
#
# Returns:
#   The file contents with every column read as character.
read_replay <- function(file_path) {
  all_character <- cols(.default = "c")
  read_tsv(file_path, col_types = all_character)
}
# Pick the most recent row of a data frame, optionally capped at a cutoff time.
#
# Args:
#   df: Data frame with a 'Timestamp' column.
#   reward_time: Cutoff time. When it is not NA, only rows whose 'Timestamp'
#     is at or before it are considered; when it is NA, all rows are.
#
# Returns:
#   The row with the greatest 'Timestamp' among the rows considered (zero rows
#   if none qualify).
get_closest_or_latest <- function(df, reward_time) {
  candidates <- df
  if (!is.na(reward_time)) {
    # Keep only rows that happened no later than the cutoff
    candidates <- filter(candidates, Timestamp <= reward_time)
  }
  # Newest first, then take the top row
  newest_first <- arrange(candidates, desc(Timestamp))
  slice(newest_first, 1)
}
# Extract arm, reward, and context information from one replay file.
#
# For every MOOClet ID found in the file this collects:
#   * the latest 'Arm' row (arm, arm_time, arm_key);
#   * the latest 'Reward' row (reward, reward_time, reward_key);
#   * for each context key, the latest row at or before arm_time (or the
#     latest row overall when there is no arm), with its Timestamp, Value and
#     Details packed into a JSON string.
# If a reward precedes the latest arm, the latest such reward is additionally
# emitted as a stand-alone row (Identifier, MOOClet ID, reward_time,
# reward_key, reward) that carries no arm information.
#
# Args:
#   file_path: Path to a tab-separated replay file.
#
# Returns:
#   A data frame in which the context rows are pivoted to one column per
#   context key, 'Action' is dropped, and a 'Date' column (the UTC date of
#   arm_time) is placed after 'MOOClet ID'. When the file yields no rows at
#   all, or only stand-alone reward rows, the unpivoted rows collected so far
#   are returned instead (possibly an empty data frame).
pilot_info_cTS <- function(file_path) {
  # Read every column as character
  df <- read_tsv(file_path, col_types = cols(.default = "c"))
  # Convert 'Timestamp' column to date-time format
  df$Timestamp <-
    as.POSIXct(df$Timestamp, format = "%Y-%m-%dT%H:%M:%S", tz = "UTC")

  # Unique MOOClet IDs, and the keys of 'Context' rows that carry a MOOClet ID
  has_mooclet <- df %>% filter(!is.na(`MOOClet ID`))
  mooclet_ids <- unique(has_mooclet$`MOOClet ID`)
  context_keys <- unique(
    (df[grepl("Context", df$Action), ] %>% filter(!is.na(`MOOClet ID`)))$Key
  )

  # Rows are accumulated here in long form and pivoted once after the loop
  result_df <- data.frame()

  for (mooclet_id in mooclet_ids) {
    # Subset the data for the current MOOClet ID
    dt <- df %>% filter(`MOOClet ID` == mooclet_id)
    reward <- NA
    reward_time <- NA
    reward_key <- NA
    arm <- NA
    arm_time <- NA
    arm_key <- NA

    # Latest 'Arm' action, if any
    is_arm <- grepl("Arm", dt$Action)
    if (any(is_arm)) {
      arm_row <- dt[is_arm, ] %>%
        arrange(desc(Timestamp)) %>%
        slice(1)
      arm_time <- arm_row$Timestamp
      arm_key <- arm_row$Key
      arm <- arm_row$Value
    }

    # Latest 'Reward' action, if any
    is_reward <- grepl("Reward", dt$Action)
    if (any(is_reward)) {
      reward_rows <- dt[is_reward, ] %>%
        arrange(desc(Timestamp))
      # todo, filter on the descending order of reward
      # A reward that precedes the latest arm is emitted on its own row with
      # no arm information. When arm_time is NA the comparison is NA and no
      # row qualifies.
      early_rewards <- reward_rows %>% filter(Timestamp < arm_time)
      if (nrow(early_rewards) > 0) {
        reward_row <- early_rewards %>% slice(1)
        result_df <- bind_rows(
          result_df,
          data.frame(
            Identifier = reward_row$Identifier,
            `MOOClet ID` = reward_row$`MOOClet ID`,
            reward_time = reward_row$Timestamp,
            reward_key = reward_row$Key,
            reward = reward_row$Value,
            check.names = FALSE
          )
        )
      }
      # NOTE(review): the latest reward is attached to this arm even when it
      # predates arm_time and was already emitted above as a stand-alone row;
      # confirm this is intended.
      reward_row <- reward_rows %>% slice(1)
      reward_time <- reward_row$Timestamp
      reward_key <- reward_row$Key
      reward <- reward_row$Value
    }

    for (key in context_keys) {
      context_dt <- dt %>% filter(Key == key)
      # If there are any rows for the current context key
      if (nrow(context_dt) > 0) {
        # Latest context row at or before 'arm_time' (latest overall if NA)
        context_dt <- get_closest_or_latest(context_dt, arm_time)
        # Add reward and arm information to the row
        context_dt$reward <- reward
        context_dt$reward_time <- reward_time
        context_dt$reward_key <- reward_key
        context_dt$arm <- arm
        context_dt$arm_time <- arm_time
        context_dt$arm_key <- arm_key
        # Combine Timestamp, Value, Details into a single JSON object in a
        # new column 'context_info'
        context_dt <- context_dt %>%
          mutate(context_info = as.character(toJSON(
            list(
              Timestamp = Timestamp,
              Value = Value,
              Details = Details
            )
          )))
        # Drop the original 'Timestamp', 'Value', 'Details' columns
        context_dt <-
          context_dt %>% select(-c(Timestamp, Value, Details))
        # Add the row to the result dataframe
        result_df <- bind_rows(result_df, context_dt)
      }
    }
  }

  # Nothing to pivot: either no rows were collected, or only stand-alone
  # reward rows exist (these have no 'Key', 'Action' or 'arm_time' columns).
  if (nrow(result_df) == 0 || !("Key" %in% names(result_df))) {
    return(result_df)
  }

  # Pivot once, after all MOOClet IDs are processed, to one column per
  # context key; then derive the date of the arm assignment.
  result_df %>%
    pivot_wider(names_from = Key, values_from = context_info) %>%
    select(-Action) %>%
    #filter(!is.na(arm)) %>%
    mutate(Date = as.Date(arm_time, tz = "UTC")) %>%
    relocate(Date, .after = `MOOClet ID`)
}
# Load one day of data from its .zip file.
#
# The archive is extracted into a temporary directory that is removed when
# the function exits, so nothing is left behind in the working directory.
# Every extracted .txt file is passed to pilot_info_cTS() and the results are
# row-bound together.
#
# Args:
#   zip_path: Path to a .zip archive of replay .txt files.
#
# Returns:
#   A data frame with the combined pilot_info_cTS() output of all .txt files
#   in the archive (an empty data frame if there are none).
load_contextual_day <- function(zip_path) {
  extract_dir <- tempfile("replay_day_")
  dir.create(extract_dir)
  on.exit(unlink(extract_dir, recursive = TRUE), add = TRUE)

  extracted <- unzip(zip_path, exdir = extract_dir)

  # Keep only .txt files; also skip macOS archive metadata entries
  # ("__MACOSX/" folders and "._" files) if the archive contains any.
  is_replay <- grepl("\\.txt$", extracted, ignore.case = TRUE) &
    !grepl("__MACOSX", extracted, fixed = TRUE) &
    !startsWith(basename(extracted), "._")
  replay_files <- extracted[is_replay]

  final <- data.frame()
  for (file in replay_files) {
    tidy_data <- pilot_info_cTS(file)
    final <- bind_rows(final, tidy_data)
  }
  #final <- arrange(final) # here need to add the columne name which is the timestamp name
  final
}
# load everything in a path, and cross map reward:
#
# Helper: attach a reward that has no arm assignment of its own to the nearest
# earlier arm record of the same MOOClet ID and Identifier.
#
# Args:
#   df: Data frame with columns 'MOOClet ID', 'Identifier', 'arm_time',
#     'reward', 'reward_time' and 'reward_key'.
#   reward_row: One-row data frame with 'MOOClet ID', 'Identifier',
#     'reward_time', 'reward_key' and 'reward'.
#
# Returns:
#   df, where the row with the latest arm_time strictly before the reward's
#   time (same MOOClet ID and Identifier) has received the reward, provided
#   that row had no reward yet. In every other case df is returned unchanged.
update_reward <- function(df, reward_row) {
  id <- reward_row$`MOOClet ID`
  person_id <- reward_row$Identifier
  reward_time1 <- reward_row$reward_time

  # Rows of the same MOOClet/person whose arm was assigned before the reward.
  # which() drops NA comparisons, e.g. rows without an arm_time.
  candidates <- which(
    df$`MOOClet ID` == id &
      df$Identifier == person_id &
      df$arm_time < reward_time1
  )
  if (length(candidates) == 0) {
    return(df)
  }

  # Only the single nearest preceding arm row is eligible (first one on ties)
  target <- candidates[which.max(as.numeric(df$arm_time[candidates]))]

  # Never overwrite a reward that is already recorded
  if (is.na(df$reward[target])) {
    df$reward_time[target] <- reward_time1
    df$reward_key[target] <- reward_row$reward_key
    df$reward[target] <- reward_row$reward
  }
  df
}
# Load every .zip archive in a directory and cross-map rewards to arms.
#
# Each archive is loaded with load_contextual_day() and the results are
# row-bound. Rows that have a reward but no arm are then handed one by one to
# update_reward() so the reward can be attached to an earlier arm record, and
# finally only rows with an arm are kept.
#
# Args:
#   dir_path: Directory containing the .zip archives.
#
# Returns:
#   A data frame of all rows that have an arm assignment (zero rows when
#   nothing with an arm was loaded).
load_all_contextual_data <- function(dir_path) {
  # 'pattern' is a regular expression, not a glob: match names ending in .zip
  zip_files <-
    list.files(dir_path, pattern = "\\.zip$", full.names = TRUE)

  final <- data.frame()
  for (zip_file in zip_files) {
    tidy_data <- load_contextual_day(zip_file)
    if (nrow(final) == 0) {
      final <- tidy_data
    } else {
      final <- bind_rows(final, tidy_data)
    }
  }

  # Without an 'arm' column no row can have an arm assignment
  if (!("arm" %in% names(final))) {
    return(final[0, , drop = FALSE])
  }

  # Rewards recorded without an arm of their own
  reward_rows <- final %>% filter(!is.na(reward) & is.na(arm))
  # seq_len() yields no iterations when there are no such rows
  for (i in seq_len(nrow(reward_rows))) {
    final <- update_reward(final, reward_rows[i, ])
  }

  final %>% filter(!is.na(arm))
}
# Replay data read-in
# Helpers for reading in replay data:
# read_replay_expand reads a replay and expands all the metadata in the 'Details' column into separate columns.
# read_replay reads a replay without expanding the metadata.
# Demo: remember to put everything in the same folder and set the working directory to the current folder via Session -> Set Working Directory -> To Source File Location.
#da <- pilot_info_cTS('R34-27566589__2023-06-20_2023-06-21.txt')
# demo for load_contextual_day
#db <- load_contextual_day('/Users/haochensong/Desktop/mooclet_replay_events/data/MHA_1.zip')
# demo for load_all_contextual_data
#dc <- load_all_contextual_data('/Users/haochensong/Desktop/mooclet_replay_events/data/')
#dd <- dc %>% filter(`MOOClet ID` == 586)