Activity annotations validation

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.1     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.2.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(here)

here() starts at /Users/vislearnlab/Documents/contexts

library(glue)
library(tidytext)
library(irr)

Loading required package: lpSolve

library(psych)


Attaching package: 'psych'

The following objects are masked from 'package:ggplot2':

    %+%, alpha

ann_1 <- read.csv(here("data/completed_annotations/DD_list_final.csv"))
ann_2 <- read.csv(here("data/completed_annotations/JY_list_final.csv"))
ann_3 <- read.csv(here("data/completed_annotations/VM_list_final.csv"))
df_cleaned <- read.csv(here("data/all_contexts_cleaned.csv"))

helpers

calculate_f_score <- function(predicted, actual) {
  # Convert to character and trim strings
  predicted <- str_trim(as.character(predicted))
  actual <- str_trim(as.character(actual))
  
  # Remove NA values
  valid_indices <- !is.na(predicted) & !is.na(actual)
  predicted <- predicted[valid_indices]
  actual <- actual[valid_indices]
  
  if(length(predicted) == 0 || length(actual) == 0) {
    return(list(precision = NA, recall = NA, f1 = NA))
  }
  
  # Get unique labels
  all_labels <- unique(c(predicted, actual))
  
  # Calculate population-level metrics across all labels
  total_tp <- 0
  total_fp <- 0
  total_fn <- 0
  total_f1s <- c()
  for(label in all_labels) {
    tp <- sum(predicted == label & actual == label)
    fp <- sum(predicted == label & actual != label)
    fn <- sum(predicted != label & actual == label)
    precision <- ifelse(tp + fp == 0, 0, tp / (tp + fp))
    recall <- ifelse(tp + fn == 0, 0, tp / (tp + fn))
    f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall))
    if ((tp+fp)>=5) {
      total_f1s <- c(total_f1s, f1)
    }
    total_tp <- total_tp + tp
    total_fp <- total_fp + fp
    total_fn <- total_fn + fn
  }
  
  # Population-level precision, recall, and F1
  precision <- ifelse(total_tp + total_fp == 0, 0, total_tp / (total_tp + total_fp))
  recall <- ifelse(total_tp + total_fn == 0, 0, total_tp / (total_tp + total_fn))
  f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall))
  #print(paste0("F1 at category level", mean(total_f1s)))
  return(list(precision = precision, recall = recall, f1 = f1))
}

main IRR and precision calculations across raters

# Step 1: Add annotator ID and combine all annotation files (including Other columns)
ann_1_labeled <- ann_1 %>%
  mutate(annotator = "DD") %>%
  dplyr::select(video_id, Location, Activity, Other.locations, Other.activities, 
         Video.description.accuracy..1.5., annotator)

ann_2_labeled <- ann_2 %>%
  mutate(annotator = "JY") %>%
  dplyr::select(video_id, Location, Activity, Other.locations, Other.activities, 
         Video.description.accuracy..1.5., annotator)

ann_3_labeled <- ann_3 %>%
  mutate(annotator = "VM") %>%
  dplyr::select(video_id, Location, Activity, Other.locations, Other.activities, 
         Video.description.accuracy..1.5., annotator)

# Combine all annotations
all_annotations <- bind_rows(ann_1_labeled, ann_2_labeled, ann_3_labeled)

# Step 2: Filter for rows where Video.description.accuracy is not empty for all three annotators
complete_ratings <- all_annotations %>%
  filter(!is.na(Video.description.accuracy..1.5.) & 
         Video.description.accuracy..1.5. != "" &
         !is.null(Video.description.accuracy..1.5.)) %>%
  group_by(video_id) %>%
  filter(n() == 3) %>%  # Only keep video_ids that have all 3 annotations
  ungroup()

# Get the video_ids that have complete ratings
complete_video_ids <- unique(complete_ratings$video_id)
clean_video_ids <- sub("^\\d+_", "", complete_video_ids)

print(paste("Number of videos with complete ratings from all three annotators:", length(clean_video_ids)))

[1] "Number of videos with complete ratings from all three annotators: 100"

# Step 3: Find matching rows in df_cleaned
df_matched <- df_cleaned %>%
  filter(video_id %in% clean_video_ids)

print(paste("Number of rows in df_cleaned matching complete annotations:", nrow(df_matched)))

[1] "Number of rows in df_cleaned matching complete annotations: 100"

# Step 4: Prepare data for inter-rater reliability analysis
# Pivot wider to have one row per video_id with columns for each annotator
location_wide <- complete_ratings %>%
  dplyr::select(video_id, annotator, Location) %>%
  pivot_wider(names_from = annotator, values_from = Location, names_prefix = "Location_")

activity_wide <- complete_ratings %>%
  dplyr::select(video_id, annotator, Activity, Other.activities) %>%
  pivot_wider(
    names_from = annotator,
    values_from = c(Activity, Other.activities),
    names_sep = "_"
  )

# Video description accuracy ratings
accuracy_wide <- complete_ratings %>%
  select(video_id, annotator, Video.description.accuracy..1.5.) %>%
  mutate(Video.description.accuracy..1.5. = as.numeric(Video.description.accuracy..1.5.)) %>%
  pivot_wider(names_from = annotator, values_from = Video.description.accuracy..1.5., 
              names_prefix = "Accuracy_")

print("=== INTER-RATER RELIABILITY ANALYSIS ===")

[1] "=== INTER-RATER RELIABILITY ANALYSIS ==="

# For Location
if(all(c("Location_DD", "Location_JY", "Location_VM") %in% colnames(location_wide))) {
  location_matrix <- location_wide %>%
    select(Location_DD, Location_JY, Location_VM) %>%
    as.matrix()
  
  # Calculate Fleiss' Kappa for Location 
  fleiss_kappa <- kappam.fleiss(location_matrix)
  kripp_alpha <- kripp.alpha(t(location_matrix), method="nominal")
  print(paste("Location - Fleiss Kappa:", round(fleiss_kappa$value, 3)))
  # Calculate agreement percentages
  location_agreement_DD_JY <- mean(location_matrix[,1] == location_matrix[,2], na.rm = TRUE)
  location_agreement_DD_VM <- mean(location_matrix[,1] == location_matrix[,3], na.rm = TRUE)
  location_agreement_JY_VM <- mean(location_matrix[,2] == location_matrix[,3], na.rm = TRUE)
  print(paste("Location Agreement DD-JY:", round(location_agreement_DD_JY * 100, 1), "%"))
  print(paste("Location Agreement DD-VM:", round(location_agreement_DD_VM * 100, 1), "%"))
  print(paste("Location Agreement JY-VM:", round(location_agreement_JY_VM * 100, 1), "%"))
  print(paste("Location Kripp alpha:", round(kripp_alpha$value, 3)))
}

Warning in kripp.alpha(t(location_matrix), method = "nominal"): NAs introduced
by coercion

[1] "Location - Fleiss Kappa: 0.565"
[1] "Location Agreement DD-JY: 72 %"
[1] "Location Agreement DD-VM: 59 %"
[1] "Location Agreement JY-VM: 59 %"
[1] "Location Kripp alpha: 0.566"

# For Activity
if(all(c("Activity_DD", "Activity_JY", "Activity_VM") %in% colnames(activity_wide))) {
  activity_matrix <- activity_wide %>%
    select(Activity_DD, Activity_JY, Activity_VM) %>%
    as.matrix()
  
  # Calculate agreement percentages
      activity_kappa <- kappa2(activity_matrix[,1:2])
      fleiss_kappa <- kappam.fleiss(activity_matrix)
      kripp_alpha <- kripp.alpha(t(activity_matrix), method="nominal")
  print(paste("Activity - Cohen's Kappa (DD vs JY):", round(activity_kappa$value, 3)))
  activity_kappa_2 <- kappa2(activity_matrix[,c(1,3)])
  print(paste("Activity - Cohen's Kappa (DD vs VM):", round(activity_kappa_2$value, 3)))
  activity_kappa_3 <- kappa2(activity_matrix[,2:3])
  print(paste("Activity - Cohen's Kappa (VM vs JY):", round(activity_kappa_3$value, 3)))
  print(paste("Mean Cohen's:", mean(activity_kappa$value, activity_kappa_2$value, activity_kappa_3$value)))
  print(paste("Activity - Fleiss Kappa:", round(fleiss_kappa$value, 3)))
  print(paste("Activity - Kripp Alpha:", round(kripp_alpha$value, 3)))
}

Warning in kripp.alpha(t(activity_matrix), method = "nominal"): NAs introduced
by coercion

[1] "Activity - Cohen's Kappa (DD vs JY): 0.458"
[1] "Activity - Cohen's Kappa (DD vs VM): 0.508"
[1] "Activity - Cohen's Kappa (VM vs JY): 0.378"
[1] "Mean Cohen's: 0.457688396791323"
[1] "Activity - Fleiss Kappa: 0.443"
[1] "Activity - Kripp Alpha: 0.444"

print("\n=== VIDEO DESCRIPTION ACCURACY RATINGS ===")

[1] "\n=== VIDEO DESCRIPTION ACCURACY RATINGS ==="

if(all(c("Accuracy_DD", "Accuracy_JY", "Accuracy_VM") %in% colnames(accuracy_wide))) {
  accuracy_matrix <- accuracy_wide %>%
    select(Accuracy_DD, Accuracy_JY, Accuracy_VM) %>%
    as.matrix()

  # Calculate means and standard deviations for each annotator
  for(annotator in c("DD", "JY", "VM")) {
    col_name <- paste0("Accuracy_", annotator)
    ratings <- accuracy_wide[[col_name]]
    mean_rating <- mean(ratings, na.rm = TRUE)
    sd_rating <- sd(ratings, na.rm = TRUE)
    print(paste(paste0("Annotator ", annotator, " - Mean:"), round(mean_rating, 2), 
                "SD:", round(sd_rating, 2)))
  }
  
  # Overall statistics across all ratings
  all_ratings <- c(accuracy_matrix)
  all_ratings <- all_ratings[!is.na(all_ratings)]
  print(paste("Overall Mean Rating:", round(mean(all_ratings), 2)))
  print(paste("Overall SD Rating:", round(sd(all_ratings), 2)))
}

[1] "Annotator DD - Mean: 3.75 SD: 1.23"
[1] "Annotator JY - Mean: 3.19 SD: 1.28"
[1] "Annotator VM - Mean: 2.88 SD: 1.45"
[1] "Overall Mean Rating: 3.27"
[1] "Overall SD Rating: 1.37"

# Step 6: Functions for inter-rater reliability calculations

# Function to calculate 3-way agreement percentage
calculate_three_way_agreement <- function(col1, col2, col3) {
  # All three agree
  all_three_agree <- mean(col1 == col2 & col2 == col3, na.rm = TRUE)
  # At least two agree (majority)
  at_least_two_agree <- mean(
    (col1 == col2) | (col1 == col3) | (col2 == col3), 
    na.rm = TRUE
  )
  return(list(all_three = all_three_agree, at_least_two = at_least_two_agree))
}


# Function to calculate set-based agreement (for expanded annotations)
calculate_set_agreement <- function(set1_list, set2_list) {
  if(length(set1_list) != length(set2_list)) {
    return(NA)
  }
  
  agreements <- numeric(length(set1_list))
  
  for(i in seq_along(set1_list)) {
    set1 <- set1_list[[i]]
    set2 <- set2_list[[i]]
    
    # Check if there's any overlap between the sets
    if(length(intersect(set1, set2)) > 0) {
      agreements[i] <- 1
    } else {
      agreements[i] <- 0
    }
  }
  
  return(mean(agreements, na.rm = TRUE))
}

# Function to calculate 3-way set agreement
calculate_three_way_set_agreement <- function(set1_list, set2_list, set3_list) {
  if(length(set1_list) != length(set2_list) || length(set2_list) != length(set3_list)) {
    return(list(all_three = NA, at_least_two = NA))
  }
  
  all_three_agree <- numeric(length(set1_list))
  at_least_two_agree <- numeric(length(set1_list))
  
  for(i in seq_along(set1_list)) {
    set1 <- set1_list[[i]]
    set2 <- set2_list[[i]]
    set3 <- set3_list[[i]]
    
    # Check pairwise overlaps
    overlap_12 <- length(intersect(set1, set2)) > 0
    overlap_13 <- length(intersect(set1, set3)) > 0
    overlap_23 <- length(intersect(set2, set3)) > 0
    
    # All three have some common element
    common_all <- length(intersect(intersect(set1, set2), set3)) > 0
    all_three_agree[i] <- as.numeric(common_all)
    
    # At least two pairs have overlap
    at_least_two_agree[i] <- as.numeric(overlap_12 | overlap_13 | overlap_23)
  }
  
  return(list(
    all_three = mean(all_three_agree, na.rm = TRUE),
    at_least_two = mean(at_least_two_agree, na.rm = TRUE)
  ))
}

# Function to expand annotations with "Other" columns
expand_annotations <- function(primary_col, other_col) {
  # Handle cases where other_col might be NA or empty
  other_col <- ifelse(is.na(other_col) | other_col == "", "", other_col)
  
  # Split other_col by comma and combine with primary
  other_items <- str_split(other_col, ",\\s*") |>
      lapply(str_trim)
  
  # Create expanded list
  expanded <- map2(primary_col, other_items, function(primary, others) {
    if(length(others) == 1 && others == "") {
      return(primary)
    } else {
      return(c(primary, others))
    }
  })
  
  return(expanded)
}

print("\n=== Comparing VQA model to humans ===")

[1] "\n=== Comparing VQA model to humans ==="

# Function to calculate F-score for sets (allowing multiple correct answers)
calculate_f_score_sets <- function(human_list, predicted) {
  if(length(human_list) == 0 || length(predicted) == 0) {
    return(list(precision = NA, recall = NA, f1 = NA))
  }
  
  # Convert actual to character and trim
  actual <- str_trim(as.character(predicted))
  
  # Remove NA values
  valid_indices <- !is.na(predicted)
  human_list <- human_list[valid_indices]
  predicted <- predicted[valid_indices]
  
  if(length(human_list) == 0 || length(predicted) == 0) {
    return(list(precision = NA, recall = NA, f1 = NA))
  }
  
  # Trim predicted sets
  human_list <- lapply(human_list, function(x) str_trim(as.character(x)))
  # Calculate population-level metrics
  total_correct <- 0
  total_predicted <- length(predicted)  # Each instance has exactly one true label
  total_actual <- 0
  for(i in seq_along(human_list)) {
    human_set <- human_list[[i]]
    predicted_label <- predicted[i]
    
    if(length(human_set) > 0) {
      total_actual <- total_actual + length(human_set)
      # Count how many predictions in this set are correct
      total_correct <- total_correct + ifelse(predicted_label %in% human_list, 1, 0)
    }
  }

  # Population-level precision and recall
  precision <- ifelse(total_predicted == 0, 0, total_correct / total_predicted)
  recall <- ifelse(total_actual == 0, 0, total_correct / total_actual)
  f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall))
  
  return(list(precision = precision, recall = recall, f1 = f1))
}



# Merge annotations with df_cleaned for comparison
comparison_data <- complete_ratings %>%
  mutate(cleaned_video_id = sub("^\\d+_", "", video_id)) %>%
  left_join(df_cleaned %>% select(video_id, Location, Activity), 
            by = c("cleaned_video_id" = "video_id"), suffix = c("_annotation", "_cleaned"))

# Create expanded annotations including "Other" columns
comparison_data_expanded <- comparison_data %>% 
  mutate(
    expanded_locations = expand_annotations(Location_annotation, Other.locations),
    expanded_activities = expand_annotations(Activity_annotation, Other.activities)
  )


# Calculate F-scores for each annotator vs df_cleaned
annotators <- c("DD", "JY", "VM")

location_precision = c()
activity_precision = c()
for(annotator in annotators) {
  annotator_data <- comparison_data %>% filter(annotator == !!annotator)

  
  # Location F-score
  location_f_score <- calculate_f_score(annotator_data$Location_annotation, 
                                      annotator_data$Location_cleaned)
  location_precision = c(location_precision, location_f_score$precision)

  # Activity F-score
  activity_f_score <- calculate_f_score(annotator_data$Activity_annotation, 
                                       annotator_data$Activity_cleaned)
  activity_precision = c(activity_precision, activity_f_score$precision)
}
print(paste0("Mean location precision:", round(mean(location_precision), 3)))

[1] "Mean location precision:0.61"

print(paste0("Mean activity precision:", round(mean(activity_precision), 3)))

[1] "Mean activity precision:0.407"

Splitting postural and non-postural activities

Just checking how many detections we have per activity

comparison_data |> filter(annotator=="DD") |> group_by(Activity_cleaned) |>
  count(Activity_cleaned) |>
  arrange(desc(n))

# A tibble: 20 × 2
# Groups:   Activity_cleaned [20]
   Activity_cleaned      n
   <chr>             <int>
 1 playing              23
 2 eating               12
 3 looking at device    11
 4 walking               8
 5 watching tv           7
 6 crawling              6
 7 reading               6
 8 sitting               5
 9 exploring             4
10 being held            3
11 cleaning              2
12 cooking               2
13 crying                2
14 drinking              2
15 lying down            2
16 conversing            1
17 dancing               1
18 drawing               1
19 music time            1
20 nursing               1

all_posture_activities <- c("being held", "walking", "standing", "sitting", "crawling" , "lying down")

process_one_coder <- function(data, coder) {
  data %>%
    rowwise() %>%
    mutate(
      combined = list(c(.data[[paste0("Activity_", coder)]], 
                       strsplit(.data[[paste0("Other.activities_", coder)]], ", ")[[1]])),
      posture = first(combined[combined %in% all_posture_activities]),
      main = first(combined[!(combined %in% c(all_posture_activities, "other")) & combined != ""]),
      main_clubbed = case_when(
        main == "exploring" ~ "playing",
        main == "other" ~ NA_character_,
        main == "watching tv" ~ "looking at device",
        TRUE ~ main
      )
    ) %>%
    ungroup() %>%
    rename_with(~paste0(., "_", coder), c(combined, posture, main, main_clubbed))
}

activity_separate <- activity_wide %>%
  process_one_coder("DD") %>%
  process_one_coder("JY") %>%
  process_one_coder("VM") %>%
  select(video_id, ends_with("_DD"), ends_with("_JY"), ends_with("_VM"))

# View the result
calculate_irr <- function(matrix_data, label) {
  cat("\n===", label, "===\n")
  
  # Cohen's Kappa for each pair
  kappa_dd_jy <- kappa2(matrix_data[,1:2])
  kappa_dd_vm <- kappa2(matrix_data[,c(1,3)])
  kappa_jy_vm <- kappa2(matrix_data[,2:3])
  
  print(paste("Cohen's Kappa (DD vs JY):", round(kappa_dd_jy$value, 3)))
  print(paste("Cohen's Kappa (DD vs VM):", round(kappa_dd_vm$value, 3)))
  print(paste("Cohen's Kappa (JY vs VM):", round(kappa_jy_vm$value, 3)))
  print(paste("Mean Cohen's Kappa:", round(mean(c(kappa_dd_jy$value, kappa_dd_vm$value, kappa_jy_vm$value)), 3)))
  
  # Fleiss' Kappa
  fleiss <- kappam.fleiss(matrix_data)
  print(paste("Fleiss' Kappa:", round(fleiss$value, 3)))
  
  # Krippendorff's Alpha
  kripp <- kripp.alpha(t(matrix_data), method="nominal")
  print(paste("Krippendorff's Alpha:", round(kripp$value, 3)))
  
  cat("\n")
}

# Posture
posture_matrix <- activity_separate %>%
  dplyr::select(posture_DD, posture_JY, posture_VM) %>%
  as.matrix()
calculate_irr(posture_matrix, "POSTURE")


=== POSTURE ===
[1] "Cohen's Kappa (DD vs JY): 0.504"
[1] "Cohen's Kappa (DD vs VM): 0.489"
[1] "Cohen's Kappa (JY vs VM): 0.501"
[1] "Mean Cohen's Kappa: 0.498"
[1] "Fleiss' Kappa: 0.493"

Warning in kripp.alpha(t(matrix_data), method = "nominal"): NAs introduced by
coercion

[1] "Krippendorff's Alpha: 0.499"

# Main activity
main_matrix <- activity_separate %>%
  dplyr::select(main_DD, main_JY, main_VM) %>%
  as.matrix()
calculate_irr(main_matrix, "MAIN ACTIVITY")


=== MAIN ACTIVITY ===
[1] "Cohen's Kappa (DD vs JY): 0.651"
[1] "Cohen's Kappa (DD vs VM): 0.709"
[1] "Cohen's Kappa (JY vs VM): 0.606"
[1] "Mean Cohen's Kappa: 0.655"
[1] "Fleiss' Kappa: 0.687"

Warning in kripp.alpha(t(matrix_data), method = "nominal"): NAs introduced by
coercion

[1] "Krippendorff's Alpha: 0.631"

# Main activity (clubbed)
main_clubbed_matrix <- activity_separate %>%
  select(main_clubbed_DD, main_clubbed_JY, main_clubbed_VM) %>%
  as.matrix()
calculate_irr(main_clubbed_matrix, "MAIN ACTIVITY (CLUBBED)")


=== MAIN ACTIVITY (CLUBBED) ===
[1] "Cohen's Kappa (DD vs JY): 0.823"
[1] "Cohen's Kappa (DD vs VM): 0.701"
[1] "Cohen's Kappa (JY vs VM): 0.703"
[1] "Mean Cohen's Kappa: 0.743"
[1] "Fleiss' Kappa: 0.756"

Warning in kripp.alpha(t(matrix_data), method = "nominal"): NAs introduced by
coercion

[1] "Krippendorff's Alpha: 0.744"

# Merge activity_separate with ground truth
comparison_activities <- activity_separate %>%
  mutate(cleaned_video_id = sub("^\\d+_", "", video_id)) %>%
  left_join(df_cleaned %>% select(video_id, Activity), 
            by = c("cleaned_video_id" = "video_id"))


comparison_activities <- comparison_activities %>%
  mutate(
    vqa_posture = ifelse(Activity %in% all_posture_activities, Activity, NA),
    vqa_main = ifelse(!(Activity %in% all_posture_activities), Activity, NA)
  )

# Calculate F-scores
annotators <- c("DD", "JY", "VM")

mean_precision = c()
for(annotator in annotators) {
  data_posture <- comparison_activities %>%
    filter(!is.na(vqa_posture))
  
  predicted <- data_posture[[paste0("posture_", annotator)]]
  actual <- data_posture$vqa_posture
  
  f_score <- calculate_f_score(predicted, actual)
  
  # cat("\n--- Annotator:", annotator, "---\n")
  #print(paste("Posture Precision:", round(f_score$precision, 3)))
   mean_precision = c(mean_precision, f_score$precision)
}
print(paste("Mean posture activity precision:", round(mean(mean_precision), 3)))

[1] "Mean posture activity precision: 0.507"

mean_precision = c()
for(annotator in annotators) {
  data_main <- comparison_activities %>%
    filter(!is.na(vqa_main))
  
  predicted <- data_main[[paste0("main_", annotator)]]
  actual <- data_main$vqa_main
  
  f_score <- calculate_f_score(predicted, actual)
  
  #cat("\n--- Annotator:", annotator, "---\n")
 #print(paste("Main Activity Precision:", round(f_score$precision, 3)))
  mean_precision = c(mean_precision, f_score$precision)
}
 print(paste("Mean main Activity Precision:", round(mean(mean_precision), 3)))

[1] "Mean main Activity Precision: 0.667"

 mean_precision = c()
for(annotator in annotators) {
 data_main <- comparison_activities %>%
  filter(!is.na(vqa_main)) %>%
  mutate(
    vqa_main = case_when(
      vqa_main == "exploring"    ~ "playing",
      vqa_main == "other"        ~ NA_character_,
      vqa_main == "watching tv"  ~ "looking at device",
      TRUE                                ~ vqa_main
    )
  )
  
  predicted <- data_main[[paste0("main_clubbed_", annotator)]]
  actual <- data_main$vqa_main
  
  f_score <- calculate_f_score(predicted, actual)
  
  #cat("\n--- Annotator:", annotator, "---\n")
  #print(paste("Main Clubbed Precision:", round(f_score$precision, 3)))
   mean_precision = c(mean_precision, f_score$precision)
}
 print(paste("Mean main clubbed activity Precision:", round(mean(mean_precision), 3)))

[1] "Mean main clubbed activity Precision: 0.744"

Confusion matrices

library(ggplot2)
library(dplyr)
library(tidyr)

# Helper function to create all pairwise comparisons
create_pairwise_comparisons <- function(data, columns) {
  expand.grid(col1 = columns, col2 = columns, stringsAsFactors = FALSE) %>%
    filter(col1 != col2) %>%
    rowwise() %>%
    do({
      data.frame(
        actual = data[[.$col1]],
        predicted = data[[.$col2]]
      )
    }) %>%
    bind_rows()
}

# Function to create confusion matrix with counts
create_confusion_data <- function(data) {
  data %>%
    filter(!is.na(predicted) & !is.na(actual)) %>%
    group_by(actual, predicted) %>%
    summarise(count = n(), .groups = "drop")
}

# Function to add proportions and PMI with Laplace smoothing
add_proportions_and_pmi <- function(confusion_data, all_activities, laplace = 1, min_count = 3) {
  # Ensure all activities are present in both dimensions
  confusion_complete <- confusion_data %>%
    complete(
      actual = all_activities, 
      predicted = all_activities, 
      fill = list(count = 0)
    ) %>%
    mutate(count_smoothed = count + laplace)
  
  # Calculate proportions and PMI
  total <- sum(confusion_complete$count_smoothed)
  
 result <- confusion_complete %>%
  group_by(actual) %>%
  mutate(
    row_total = sum(count_smoothed),
    prop = count_smoothed / row_total,
    keep_row = sum(count) >= min_count
  ) %>%
  ungroup() %>%
  filter(!predicted %in% actual[!keep_row]) %>%
  group_by(predicted) %>%
  mutate(col_total = sum(count_smoothed)) %>%
  ungroup() %>%
  mutate(
    p_actual = row_total / total,
    p_predicted = col_total / total,
    p_joint = count_smoothed / total,
    pmi = log2(p_joint / (p_actual * p_predicted))
  ) %>%
  filter(keep_row) %>%
  select(actual, predicted, count, prop, pmi)
 return(result)
}

# Function to order by max PMI
order_by_max_pmi <- function(confusion_data) {
  # Get ordering for actual (rows)
  row_order <- confusion_data %>%
    group_by(actual) %>%
    summarise(max_pmi = max(pmi, na.rm = TRUE)) %>%
    arrange(desc(max_pmi)) %>%
    pull(actual)
  
  # Get ordering for predicted (columns)
  col_order <- confusion_data %>%
    group_by(predicted) %>%
    summarise(max_pmi = max(pmi, na.rm = TRUE)) %>%
    arrange(desc(max_pmi)) %>%
    pull(predicted)
  
  confusion_data %>%
    mutate(
      actual = factor(actual, levels = row_order),
      predicted = factor(predicted, levels = col_order)
    )
}

# Plotting functions
plot_confusion_counts <- function(confusion_data, title, x_lab, y_lab) {
  ggplot(confusion_data, aes(x = predicted, y = actual, fill = count)) +
    geom_tile(color = "gray80", linewidth = 0.2) +
    geom_text(aes(label = count), color = "black", size = 3) +
    scale_fill_gradient(low = "white", high = "steelblue") +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
      plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
    ) +
    labs(title = title, x = x_lab, y = y_lab, fill = "Count") +
    coord_fixed()
}

plot_confusion_proportions <- function(confusion_data, title, x_lab, y_lab) {
  ggplot(confusion_data, aes(x = predicted, y = actual, fill = prop)) +
    geom_tile(color = "gray80", linewidth = 0.2) +
    geom_text(aes(label = sprintf("%.2f", prop)), color = "black", size = 3) +
    scale_fill_gradient(low = "white", high = "steelblue") +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
      plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
    ) +
    labs(title = title, x = x_lab, y = y_lab, fill = "Proportion") +
    coord_fixed()
}

plot_confusion_pmi <- function(confusion_data, title, x_lab, y_lab) {
  ggplot(confusion_data, aes(x = predicted, y = actual, fill = pmi)) +
    geom_tile(color = "gray80", linewidth = 0.2) +
    scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
      plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
    ) +
    labs(title = title, x = x_lab, y = y_lab, fill = "PMI") +
    coord_fixed()
}

# Define all activities for each category
all_main_activities <- comparison_data |> 
  filter(!(Activity_cleaned %in% all_posture_activities)) |> 
  pull(Activity_cleaned) |> 
  unique()


# POSTURE: Human-Human
posture_human_human <- create_pairwise_comparisons(
  activity_separate, 
  c("posture_DD", "posture_JY", "posture_VM")
) %>%
  create_confusion_data() %>%
  add_proportions_and_pmi(all_posture_activities) %>%
  order_by_max_pmi()

plot_confusion_counts(posture_human_human, "Confusion counts: inter-rater postural activities", "All raters", "All other raters")

plot_confusion_proportions(posture_human_human, "Posture Human-Human: Proportions", "Human", "Human")

plot_confusion_pmi(posture_human_human, "Posture Human-Human: PMI", "Human", "Human")

# MAIN ACTIVITY: Human-Human
main_human_human <- create_pairwise_comparisons(
  activity_separate, 
  c("main_DD", "main_JY", "main_VM")
) %>%
  create_confusion_data() %>%
  add_proportions_and_pmi(all_main_activities) %>%
  order_by_max_pmi()

plot_confusion_counts(main_human_human, "Confusion counts: inter-rater non-postural activities", "All raters", "All other raters")

plot_confusion_proportions(main_human_human, "Main Activity Human-Human: Proportions", "Human", "Human")

plot_confusion_pmi(main_human_human, "PMI: inter-rater non-postural activities", "All raters", "All other raters")

# ============================================
# HUMAN-VQA COMPARISONS
# ============================================

# POSTURE: Human-VQA
posture_human_vqa <- bind_rows(
  comparison_activities %>% filter(!is.na(vqa_posture)) %>%
    select(actual = posture_DD, predicted = vqa_posture),
  comparison_activities %>% filter(!is.na(vqa_posture)) %>%
    select(actual = posture_JY, predicted = vqa_posture),
  comparison_activities %>% filter(!is.na(vqa_posture)) %>%
    select(actual = posture_VM, predicted = vqa_posture)
) %>%
  create_confusion_data() %>%
  add_proportions_and_pmi(all_posture_activities) %>%
  order_by_max_pmi()

plot_confusion_counts(posture_human_vqa, "Postural Human-VQA: Counts", "VQA", "Human")

plot_confusion_proportions(posture_human_vqa, "Postural Human-VQA: Proportions", "VQA", "Human")

plot_confusion_pmi(posture_human_vqa, "Postural Human-VQA: PMI", "VQA", "Human")

# MAIN ACTIVITY: Human-VQA
main_human_vqa <- bind_rows(
  comparison_activities %>% filter(!is.na(vqa_main)) %>%
    select(actual = main_DD, predicted = vqa_main),
  comparison_activities %>% filter(!is.na(vqa_main)) %>%
    select(actual = main_JY, predicted = vqa_main),
  comparison_activities %>% filter(!is.na(vqa_main)) %>%
    select(actual = main_VM, predicted = vqa_main)
) %>%
  create_confusion_data() %>%
  add_proportions_and_pmi(all_main_activities) %>%
  order_by_max_pmi()

plot_confusion_counts(main_human_vqa, "Non-postural activity Human-VQA: Counts", "VQA", "Human")

plot_confusion_proportions(main_human_vqa, "Non-postural activity Human-VQA: Proportions", "VQA", "Human")

plot_confusion_pmi(main_human_vqa, "Non-postural activity Human-VQA: PMI", "VQA", "Human")

expanded contexts within rater (all activities together)

activities_expanded <- comparison_data_expanded |> 
  rename(Location_model = Location_cleaned,
         Activity_model = Activity_cleaned) |>
  rowwise() |>
  filter(Activity_model %in% expanded_activities) |>
  group_by(annotator) |>
  count() |>
  mutate(mean = n / 100)

locations_expanded <- comparison_data_expanded |> 
  rename(Location_model = Location_cleaned,
         Activity_model = Activity_cleaned) |>
  rowwise() |>
  filter(Location_model %in% expanded_locations) |>
  group_by(annotator) |>
  count() |> 
  mutate(mean = n / 100)
  
paste("Mean activity precision (when including other activities):", round(mean(activities_expanded$mean), 3))

[1] "Mean activity precision (when including other activities): 0.513"

paste("Mean locatin precision (when including other location):", round(mean(locations_expanded$mean), 3))

[1] "Mean locatin precision (when including other location): 0.687"

saveRDS(comparison_data_expanded, here("data/completed_annotations/comparison_annotation_data.Rds"))

expanded contexts across raters (all activities together)

comparison_data_expanded |>
  mutate(Location_model = str_trim(Location_cleaned),
         Activity_model = str_trim(Activity_cleaned)) |>
  rowwise() |>
  mutate(correct_activity_annotation = Activity_model %in% expanded_activities,
         correct_location_annotation = Location_model %in% expanded_locations) |>
  group_by(video_id) |>
  summarize(activity_correct = any(correct_activity_annotation),
            location_correct = any(correct_location_annotation)) |>
  ungroup() |>
  summarize(loc_acc = sum(location_correct==TRUE)/n(),
            act_acc = sum(activity_correct==TRUE)/n())

# A tibble: 1 × 2
  loc_acc act_acc
    <dbl>   <dbl>
1    0.82    0.67

print("Mean number of activities")

[1] "Mean number of activities"

mean(sapply(comparison_data_expanded$expanded_activities, length))

[1] 1.65

sd(sapply(comparison_data_expanded$expanded_activities, length))

[1] 0.6750449

print("Mean number of locations")

[1] "Mean number of locations"

mean(sapply(comparison_data_expanded$expanded_locations, length))

[1] 1.19

sd(sapply(comparison_data_expanded$expanded_locations, length))

[1] 0.409625

descriptions

paste("Mean video description accuracy:", round(mean(comparison_data_expanded$Video.description.accuracy..1.5.), 2),
      "SD video description accuracy:", round(sd(comparison_data_expanded$Video.description.accuracy..1.5.), 2))

[1] "Mean video description accuracy: 3.27 SD video description accuracy: 1.37"

ratings <- accuracy_wide[, c("Accuracy_DD", "Accuracy_JY", "Accuracy_VM")]

kripp_ratings <- irr::kripp.alpha(t(ratings), method = "ordinal")
paste("IRR for video descriptions:", round(kripp_ratings$value, 2))

[1] "IRR for video descriptions: 0.52"

agreement <- ratings |>
  filter(Accuracy_DD == Accuracy_JY & Accuracy_JY == Accuracy_VM) |>
  count()
paste("Number of frames for which video description accuracy rating matches across all raters:", agreement$n)

[1] "Number of frames for which video description accuracy rating matches across all raters: 12"