── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/english-ewt-ud-2.5-191206.udpipe to /Users/visuallearninglab/Documents/babyview-object/video-qa/unconstrained_objects/english-ewt-ud-2.5-191206.udpipe
- This model has been trained on version 2.5 of data from https://universaldependencies.org
- The model is distributed under the CC-BY-SA-NC license: https://creativecommons.org/licenses/by-nc-sa/4.0
- Visit https://github.com/jwijffels/udpipe.models.ud.2.5 for model license details.
- For a list of all models and their licenses (most models you can download with this package have either a CC-BY-SA or a CC-BY-SA-NC license) read the documentation at ?udpipe_download_model. For building your own models: visit the documentation by typing vignette('udpipe-train', package = 'udpipe')
Downloading finished, model stored at '/Users/visuallearninglab/Documents/babyview-object/video-qa/unconstrained_objects/english-ewt-ud-2.5-191206.udpipe'
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): item_definition, measure
dbl (3): intercept, slope, aoa
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Bar chart restricted to multi-word objects, i.e. lemmas that contain a space.
# `input_n = 30` is forwarded to object_counts.bar() unchanged.
object_counts |>
  filter(grepl(" ", cleaned_lemma)) |>
  rename(object = cleaned_lemma) |>
  object_counts.bar(
    input_title = "Proportion of Frames with Each Object",
    input_n = 30
  )
# Keep only the lemmas that do not appear among the YOLOe class names.
non_yoloe_word_counts <- object_counts |>
  filter(!(cleaned_lemma %in% yoloe_objects$class_name))

# Plot those lemmas; the plotting helper expects the column to be named `object`.
non_yoloe_word_counts |>
  rename(object = cleaned_lemma) |>
  object_counts.bar(input_title = "Objects not in YOLOe but in VQA")
CDI only
# Unique item definitions from the AoA table, lower-cased and with any
# parenthesised qualifier (plus the whitespace before it) stripped.
aoa_words <- aoa_values$item_definition |>
  str_to_lower() |>
  str_replace_all("\\s*\\([^\\)]+\\)", "") |>
  unique()

# Of the lemmas absent from YOLOe, plot only those that are also AoA words.
non_yoloe_word_counts |>
  filter(cleaned_lemma %in% aoa_words) |>
  rename(object = cleaned_lemma) |>
  object_counts.bar(
    input_title = "Objects not in YOLOe but in CDI and in VQA"
  )
YOLOe words missing from VQA
# Lemmas present in both sources. YOLOe rows are de-duplicated by class name
# first, and its `proportion` column is renamed so it does not collide with
# the columns coming from `object_counts`.
yoloe_and_vqa <- inner_join(
  object_counts,
  yoloe_objects |>
    rename(yoloe_proportion = proportion) |>
    distinct(class_name, .keep_all = TRUE),
  by = c("cleaned_lemma" = "class_name")
)

# YOLOe classes that found no match above.
yoloe_only_words <- yoloe_objects |>
  anti_join(yoloe_and_vqa, by = c("class_name" = "cleaned_lemma")) |>
  rename(object = class_name)

yoloe_only_words |>
  object_counts.bar(input_title = "Words in YOLOe but not VQA")
Some of these words appear to be missing only because of our lemmatization: for example, ‘block’ and ‘blocks’ are treated as different words and therefore fail to match.
The following objects are masked from 'package:lubridate':
%--%, union
The following objects are masked from 'package:dplyr':
as_data_frame, groups, union
The following objects are masked from 'package:purrr':
compose, simplify
The following object is masked from 'package:tidyr':
crossing
The following object is masked from 'package:tibble':
as_data_frame
The following objects are masked from 'package:stats':
decompose, spectrum
The following object is masked from 'package:base':
union
library(jsonlite)
Attaching package: 'jsonlite'
The following object is masked from 'package:purrr':
flatten
# --- Step 1: Load JSON relations ---
# `links` is the edge table of the graph; its `source` and `target` columns
# are used below to find the roots.
relations <- fromJSON(here("data/unconstrained_objects/wordnet.json"))$links %>%
  as_tibble()
# Maps graph node ids (X1) to words (X0) -- presumably synset ids such as
# "dog.n.01"; confirm against the CSV.
wordmap <- read.csv(here("data/unconstrained_objects/wordnet_word_map.csv"))
g <- graph_from_data_frame(relations, directed = TRUE)

# --- Get root nodes ---
# A root is any source that never appears as a target.
root_nodes <- setdiff(relations$source, relations$target)

# --- Get minimum distance from any root to all nodes ---
# NOTE(review): distances() is called without `mode`; igraph documents the
# default as "all" (edge direction ignored). Confirm that is intended rather
# than mode = "out".
all_nodes <- V(g)$name
distances_df <- map_dfr(root_nodes, function(root) {
  sp <- distances(g, v = root, to = all_nodes)
  tibble(target = colnames(sp), distance = as.numeric(sp))
}) %>%
  group_by(target) %>%
  summarise(min_distance = min(distance, na.rm = TRUE), .groups = "drop")

# --- Compute number of direct children for each node ---
direct_children_df <- tibble(
  target = V(g)$name,
  num_direct_children = degree(g, mode = "out")
)

# --- Compute direct parent (take the first one if multiple) ---
direct_parents_df <- tibble(
  target = V(g)$name,
  direct_parent = map_chr(V(g)$name, function(node) {
    parents <- neighbors(g, node, mode = "in")$name
    if (length(parents) > 0) parents[1] else NA_character_
  })
)

# --- Annotate ---
# Turn a node id into a readable word: underscores become spaces and
# everything from the first "." onwards is dropped.
# str_replace_all() is required here: str_replace() only swaps the first
# underscore, so "ice_cream_cone" would become "ice cream_cone".
syn_to_word <- function(target) {
  word(str_replace_all(target, "_", " "), 1, sep = fixed("."))
}

annotated <- distances_df %>%
  mutate(cleaned_target = syn_to_word(target)) %>%
  # Explicit `by` keeps the join key stable and silences the
  # "Joining with `by = join_by(target)`" message.
  left_join(
    wordmap |>
      transmute(target = X1, object = str_replace_all(X0, "_", " ")),
    by = "target"
  ) %>%
  left_join(all_objects, by = "object") %>%
  # Several node ids can map to the same word; keep the first row per object.
  distinct(object, .keep_all = TRUE) %>%
  left_join(direct_children_df, by = "target") %>%
  left_join(direct_parents_df, by = "target")
Joining with `by = join_by(target)`
# --- Final summary: group by distance and aggregate ---
# One row per `min_distance`.
# The previous version also emitted `proportion = exp(log_prop)`, which is a
# per-object vector: it made summarise() return one row per object (with the
# dplyr >= 1.1.0 deprecation warning) and repeated every per-group value.
# Per-object proportions are still available as exp(log_prop) in `annotated`.
summary_df <- annotated %>%
  group_by(min_distance) %>%
  summarise(
    total_objects = n(),
    # Objects without a log_prop value contribute nothing to coverage instead
    # of turning the whole group's total into NA.
    total_coverage = sum(exp(log_prop), na.rm = TRUE),
    avg_num_direct_children = mean(num_direct_children),
    .groups = "drop"
  )
Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
always returns an ungrouped data frame and adjust accordingly.
# Plot
# One jittered point per object (x = graph distance, y = exp(log_prop)),
# an "n=" label per row of `summary_df` drawn at y = 0.3, and repelled text
# labels for objects whose exp(log_prop) exceeds 0.04.
ggplot(data = annotated, mapping = aes(x = min_distance, y = exp(log_prop))) +
  geom_jitter(
    width = 0.1,
    height = 0,
    alpha = 0.4,
    color = "steelblue"
  ) +
  geom_text(
    data = summary_df,
    mapping = aes(
      x = min_distance,
      y = 0.3,
      label = paste0("n=", total_objects)
    ),
    size = 3,
    vjust = 0
  ) +
  ggrepel::geom_label_repel(
    data = filter(annotated, exp(log_prop) > 0.04),
    mapping = aes(label = object)
  ) +
  theme_minimal() +
  labs(
    title = "Object Coverage by Graph Distance",
    x = "Degrees of Separation",
    y = "Proportion of frames"
  ) +
  scale_x_continuous(breaks = summary_df$min_distance)
Warning: Removed 17 rows containing missing values or values outside the scale range
(`geom_point()`).