# Packages used in this section
library(stringr)  # str_trim
library(dplyr)
library(tidyr)

calculate_f_score <- function(predicted, actual) {
  # Convert to character and trim strings
  predicted <- str_trim(as.character(predicted))
  actual <- str_trim(as.character(actual))

  # Remove NA values
  valid_indices <- !is.na(predicted) & !is.na(actual)
  predicted <- predicted[valid_indices]
  actual <- actual[valid_indices]

  if (length(predicted) == 0 || length(actual) == 0) {
    return(list(precision = NA, recall = NA, f1 = NA))
  }

  # Get unique labels
  all_labels <- unique(c(predicted, actual))

  # Calculate population-level (micro-averaged) metrics across all labels
  total_tp <- 0
  total_fp <- 0
  total_fn <- 0
  total_f1s <- c()

  for (label in all_labels) {
    tp <- sum(predicted == label & actual == label)
    fp <- sum(predicted == label & actual != label)
    fn <- sum(predicted != label & actual == label)

    precision <- ifelse(tp + fp == 0, 0, tp / (tp + fp))
    recall <- ifelse(tp + fn == 0, 0, tp / (tp + fn))
    f1 <- ifelse(precision + recall == 0, 0,
                 2 * precision * recall / (precision + recall))

    # Track per-label F1 only for labels predicted at least 5 times
    if ((tp + fp) >= 5) {
      total_f1s <- c(total_f1s, f1)
    }

    total_tp <- total_tp + tp
    total_fp <- total_fp + fp
    total_fn <- total_fn + fn
  }

  # Population-level precision, recall, and F1
  precision <- ifelse(total_tp + total_fp == 0, 0, total_tp / (total_tp + total_fp))
  recall <- ifelse(total_tp + total_fn == 0, 0, total_tp / (total_tp + total_fn))
  f1 <- ifelse(precision + recall == 0, 0,
               2 * precision * recall / (precision + recall))

  # print(paste0("F1 at category level: ", mean(total_f1s)))

  return(list(precision = precision, recall = recall, f1 = f1))
}
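For reference, a minimal usage sketch on toy label vectors (the vectors and values below are hypothetical, not study data). Since each item gets exactly one predicted label, micro-averaged precision, recall, and F1 all reduce to overall accuracy:

# Hypothetical example: predicted vs. gold labels for six items
pred <- c("kitchen", "kitchen", "office", "park", "office", NA)
gold <- c("kitchen", "office",  "office", "park", "park",   "park")

scores <- calculate_f_score(pred, gold)
# The NA pair is dropped, leaving 5 items with 3 correct,
# so precision = recall = f1 = 0.6
scores$f1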
Main IRR and precision calculations across raters
# Step 1: Add annotator ID and combine all annotation files (including Other columns)
ann_1_labeled <- ann_1 %>%
  mutate(annotator = "DD") %>%
  dplyr::select(video_id, Location, Activity, Other.locations, Other.activities,
                Video.description.accuracy..1.5., annotator)

ann_2_labeled <- ann_2 %>%
  mutate(annotator = "JY") %>%
  dplyr::select(video_id, Location, Activity, Other.locations, Other.activities,
                Video.description.accuracy..1.5., annotator)

ann_3_labeled <- ann_3 %>%
  mutate(annotator = "VM") %>%
  dplyr::select(video_id, Location, Activity, Other.locations, Other.activities,
                Video.description.accuracy..1.5., annotator)

# Combine all annotations
all_annotations <- bind_rows(ann_1_labeled, ann_2_labeled, ann_3_labeled)

# Step 2: Keep rows where Video.description.accuracy is non-missing and non-empty,
# then keep only videos rated by all three annotators
complete_ratings <- all_annotations %>%
  filter(!is.na(Video.description.accuracy..1.5.) &
           Video.description.accuracy..1.5. != "") %>%
  group_by(video_id) %>%
  filter(n() == 3) %>%  # Only keep video_ids that have all 3 annotations
  ungroup()

# Get the video_ids that have complete ratings and strip the numeric prefix
complete_video_ids <- unique(complete_ratings$video_id)
clean_video_ids <- sub("^\\d+_", "", complete_video_ids)

print(paste("Number of videos with complete ratings from all three annotators:",
            length(clean_video_ids)))
[1] "Number of videos with complete ratings from all three annotators: 100"
# Step 3: Find matching rows in df_cleaned
df_matched <- df_cleaned %>%
  filter(video_id %in% clean_video_ids)

print(paste("Number of rows in df_cleaned matching complete annotations:",
            nrow(df_matched)))
[1] "Number of rows in df_cleaned matching complete annotations: 100"
# Step 4: Prepare data for inter-rater reliability analysis
# Pivot wider to have one row per video_id with columns for each annotator
location_wide <- complete_ratings %>%
  dplyr::select(video_id, annotator, Location) %>%
  pivot_wider(names_from = annotator, values_from = Location,
              names_prefix = "Location_")

activity_wide <- complete_ratings %>%
  dplyr::select(video_id, annotator, Activity, Other.activities) %>%
  pivot_wider(
    names_from = annotator,
    values_from = c(Activity, Other.activities),
    names_sep = "_"
  )

# Video description accuracy ratings
accuracy_wide <- complete_ratings %>%
  dplyr::select(video_id, annotator, Video.description.accuracy..1.5.) %>%
  mutate(Video.description.accuracy..1.5. =
           as.numeric(Video.description.accuracy..1.5.)) %>%
  pivot_wider(names_from = annotator,
              values_from = Video.description.accuracy..1.5.,
              names_prefix = "Accuracy_")
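To make the reshape concrete, a minimal sketch of the long-to-wide pivot on toy ratings (hypothetical values):

toy_long <- tibble::tibble(
  video_id  = rep(c("v1", "v2"), each = 3),
  annotator = rep(c("DD", "JY", "VM"), times = 2),
  rating    = c(4, 3, 4, 5, 2, 3)
)

tidyr::pivot_wider(toy_long, names_from = annotator,
                   values_from = rating, names_prefix = "Accuracy_")
#> # A tibble: 2 x 4
#>   video_id Accuracy_DD Accuracy_JY Accuracy_VM
#>   <chr>          <dbl>       <dbl>       <dbl>
#> 1 v1                 4           3           4
#> 2 v2                 5           2           3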
paste("Mean video description accuracy:", round(mean(comparison_data_expanded$Video.description.accuracy..1.5.), 2),"SD video description accuracy:", round(sd(comparison_data_expanded$Video.description.accuracy..1.5.), 2))
[1] "Mean video description accuracy: 3.27 SD video description accuracy: 1.37"
ratings <- accuracy_wide[, c("Accuracy_DD", "Accuracy_JY", "Accuracy_VM")]

# kripp.alpha() expects a raters x subjects matrix, hence the transpose
kripp_ratings <- irr::kripp.alpha(t(ratings), method = "ordinal")
paste("IRR for video descriptions:", round(kripp_ratings$value, 2))
[1] "IRR for video descriptions: 0.52"
agreement <- ratings |>
  filter(Accuracy_DD == Accuracy_JY & Accuracy_JY == Accuracy_VM) |>
  count()

paste("Number of frames for which video description accuracy rating matches across all raters:",
      agreement$n)
[1] "Number of frames for which video description accuracy rating matches across all raters: 12"