Rows: 3836 Columns: 13
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): object, category, FormatsSeen, Frequency, globalID
dbl (8): objectNumber, categoryNumber, trialIndex, reactionTime, TotalCount,...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the pre-trained UDPipe model for English.
# `overwrite = FALSE` reuses a copy that is already on disk in the model
# directory instead of downloading the file again on every run.
ud_model <- udpipe_download_model(language = "english", overwrite = FALSE)
Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/english-ewt-ud-2.5-191206.udpipe to /Users/visuallearninglab/Documents/vedi_survey/vedi/analysis/main/english-ewt-ud-2.5-191206.udpipe
- This model has been trained on version 2.5 of data from https://universaldependencies.org
- The model is distributed under the CC-BY-SA-NC license: https://creativecommons.org/licenses/by-nc-sa/4.0
- Visit https://github.com/jwijffels/udpipe.models.ud.2.5 for model license details.
- For a list of all models and their licenses (most models you can download with this package have either a CC-BY-SA or a CC-BY-SA-NC license) read the documentation at ?udpipe_download_model. For building your own models: visit the documentation by typing vignette('udpipe-train', package = 'udpipe')
Downloading finished, model stored at '/Users/visuallearninglab/Documents/vedi_survey/vedi/analysis/main/english-ewt-ud-2.5-191206.udpipe'
# Read the model file referenced by `ud_model` so it can be used for annotation
udpipe_model <- udpipe_load_model(ud_model$file_model)

# Load the `stop_words` data set into the workspace
# NOTE(review): presumably tidytext's stop word table (its `word` column is
# used when filtering lemmas) -- confirm that package is attached.
data(stop_words)
Favorites data analysis
# Favorites analysis ----
# Cleans the free-text "favorite" answers, lemmatizes them with UDPipe, and
# plots the most frequent lemmas. Reads `filtered_experience_data`,
# `udpipe_model`, and `stop_words` from the enclosing environment.

# Clean the data: one row per participant (`globalID`) with one
# `favorite_<k>` column per distinct favorite object.
favorites_clean <- filtered_experience_data |>
  filter(category == "favorite") |>
  distinct(globalID, object) |>
  group_by(globalID) |>
  mutate(colname = paste0("favorite_", row_number())) |>
  ungroup() |>
  pivot_wider(names_from = colname, values_from = object)

# Combine all favorite objects into one character vector.
# The wide table pads participants that have fewer favorites with NA; those
# cells are dropped here so that only real answers reach the annotator.
favorites_combined <- favorites_clean |>
  pivot_longer(
    cols = -globalID,
    names_to = "favorite_id",
    values_to = "favorite_item",
    values_drop_na = TRUE
  ) |>
  pull(favorite_item)

# Lower-case, remove non-alphanumeric characters (spaces are kept), and strip
# a trailing "s" from the end of each answer as a rough plural removal.
# NOTE(review): "s$" also clips non-plural answers that end in "s"
# (e.g. "bus" -> "bu"); the lemmatizer below may make this step unnecessary.
cleaned_favorites <- favorites_combined |>
  str_to_lower() |>
  str_replace_all("[^[:alnum:] ]", "") |>
  str_replace_all("s$", "")

# Use UDPipe to lemmatize the text
annotations <- udpipe_annotate(udpipe_model, x = cleaned_favorites)
annotations_df <- as.data.frame(annotations)

# Keep only the lemmas (base forms of the words) and remove stop words
word_tokens_clean <- annotations_df |>
  select(doc_id, lemma) |>
  filter(!lemma %in% stop_words$word)

# Re-assemble each answer (one `doc_id` per answer) from its remaining lemmas
favorites_with_lemmas <- word_tokens_clean |>
  group_by(doc_id) |>
  summarise(favorite_phrase = paste(lemma, collapse = " ")) |>
  ungroup()

# Count the frequency of each lemma
word_counts <- word_tokens_clean |>
  count(lemma, sort = TRUE)

# Top 20 most common lemmatized phrases (ties at the cut-off are kept)
favorites_with_lemmas_counts <- favorites_with_lemmas |>
  count(favorite_phrase, sort = TRUE) |>
  filter(favorite_phrase != "na") |>
  slice_max(n, n = 20)

# Top 20 most common lemmas. The object keeps its historical name
# `top_10_lemmas` so that later references keep working. "na" is removed
# BEFORE ranking so that it cannot occupy one of the 20 slots.
top_10_lemmas <- word_counts |>
  filter(lemma != "na") |>
  slice_max(n, n = 20)

# Plot the top 20 most common lemmas
ggplot(top_10_lemmas, aes(x = reorder(lemma, n), y = n)) +
  geom_col(fill = "skyblue") +
  coord_flip() + # Flip the axes for better readability
  labs(
    title = "Top 20 most common favorite words",
    x = "Lemma",
    y = "Frequency"
  ) +
  theme_minimal()
# Horizontal bar chart of the most common lemmatized favorite phrases,
# ordered by count (`n`) from `favorites_with_lemmas_counts`.
ggplot(
  favorites_with_lemmas_counts,
  aes(x = reorder(favorite_phrase, n), y = n)
) +
  geom_col(fill = "skyblue") +
  coord_flip() + # Flip the axes for better readability
  labs(
    title = "Top 20 most common favorite things",
    x = "Thing",
    y = "Frequency"
  ) +
  theme_minimal()