Load data

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4          ✔ readr     2.1.5     
✔ forcats   1.0.0          ✔ stringr   1.5.1     
✔ ggplot2   3.5.1.9000     ✔ tibble    3.2.1     
✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
✔ purrr     1.0.4          
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(udpipe)
library(tidytext)
library(here)

here() starts at /Users/visuallearninglab/Documents/vedi_survey/vedi

# Read the cleaned experience data from the project's processed-data folder
filtered_experience_data <- read_csv(
  here("data", "main", "processed", "experience-data-cleaned.csv")
)

Rows: 3836 Columns: 13
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): object, category, FormatsSeen, Frequency, globalID
dbl (8): objectNumber, categoryNumber, trialIndex, reactionTime, TotalCount,...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Fetch the pre-trained English model. `overwrite = FALSE` reuses a copy that
# is already on disk instead of downloading and overwriting it on every run.
ud_model <- udpipe_download_model(language = "english", overwrite = FALSE)

# Stop early with a readable message if the model could not be obtained,
# rather than failing later when the model file is loaded.
if (isTRUE(ud_model$download_failed)) {
  stop(
    "Could not obtain the English udpipe model: ",
    ud_model$download_message,
    call. = FALSE
  )
}

Downloading udpipe model from https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/english-ewt-ud-2.5-191206.udpipe to /Users/visuallearninglab/Documents/vedi_survey/vedi/analysis/main/english-ewt-ud-2.5-191206.udpipe
 - This model has been trained on version 2.5 of data from https://universaldependencies.org
 - The model is distributed under the CC-BY-SA-NC license: https://creativecommons.org/licenses/by-nc-sa/4.0
 - Visit https://github.com/jwijffels/udpipe.models.ud.2.5 for model license details.
 - For a list of all models and their licenses (most models you can download with this package have either a CC-BY-SA or a CC-BY-SA-NC license) read the documentation at ?udpipe_download_model. For building your own models: visit the documentation by typing vignette('udpipe-train', package = 'udpipe')
Downloading finished, model stored at '/Users/visuallearninglab/Documents/vedi_survey/vedi/analysis/main/english-ewt-ud-2.5-191206.udpipe'

# Load the downloaded model file so it can be used for annotation
udpipe_model <- udpipe_load_model(file = ud_model$file_model)

# Make the `stop_words` data set available in the workspace
data("stop_words")

Favorites data analysis

# Build one row per globalID, with that ID's distinct "favorite" objects
# spread across columns favorite_1, favorite_2, ...
favorites_clean <- filtered_experience_data |>
  filter(category == "favorite") |>
  distinct(globalID, object) |>
  # Number the favorites within each globalID to get the target column names
  mutate(colname = paste0("favorite_", row_number()), .by = globalID) |>
  pivot_wider(names_from = colname, values_from = object)

# Combine all favorite objects into one character vector.
# Uses pivot_longer() in place of the superseded gather(). Elements now come
# out row by row (all favorites of one globalID together) rather than column
# by column; the frequency counts computed below do not depend on this order.
favorites_combined <- favorites_clean |>
  pivot_longer(
    cols = -globalID,
    names_to = "favorite_id",
    values_to = "favorite_item"
  ) |>
  pull(favorite_item)

# Normalise the text: lower-case it, drop every character that is not a
# letter, digit or space, then strip a single trailing "s" as a rough way of
# removing plurals.
# NOTE(review): "s$" also trims non-plural endings (e.g. "bus" -> "bu");
# confirm this is intended given the lemmatisation step that follows.
cleaned_favorites <- favorites_combined |>
  str_to_lower() |>
  str_remove_all("[^[:alnum:] ]") |>
  str_remove_all("s$")

# Annotate the cleaned favorites with the UDPipe model; the lemma column of
# the result is what the following steps work with
annotations <- udpipe_annotate(object = udpipe_model, x = cleaned_favorites)

# Convert the annotation result into a data frame
annotations_df <- as.data.frame(annotations)

# Keep only the document id and lemma (base form) of each token, discarding
# any lemma that appears in the stop-word list
word_tokens_clean <- annotations_df |>
  filter(!(lemma %in% stop_words$word)) |>
  select(doc_id, lemma)

# Rebuild one phrase per document by joining its remaining lemmas with spaces
favorites_with_lemmas <- word_tokens_clean |>
  group_by(doc_id) |>
  summarise(favorite_phrase = paste(lemma, collapse = " "), .groups = "drop")

# Tally how often each lemma occurs, most frequent first
word_counts <- word_tokens_clean |>
  count(lemma) |>
  arrange(desc(n))

# Count how often each full lemmatised phrase occurs, drop the "na"
# placeholder, and keep the 20 most frequent phrases (ties at the cut-off are
# kept). slice_max() replaces the superseded top_n().
favorites_with_lemmas_counts <- favorites_with_lemmas |>
  count(favorite_phrase, sort = TRUE) |>
  filter(favorite_phrase != "na") |>
  slice_max(n, n = 20)

# Select the 20 most common lemmas (ties at the cut-off are kept).
# The "na" placeholder is removed *before* ranking so it cannot take one of
# the 20 slots, matching how favorites_with_lemmas_counts is built.
# The object keeps its historical name, top_10_lemmas, because the plot below
# refers to it. slice_max() replaces the superseded top_n().
top_10_lemmas <- word_counts |>
  filter(lemma != "na") |>
  slice_max(n, n = 20)

# Plot the 20 most common lemmas as a horizontal bar chart, ordered by count
ggplot(top_10_lemmas, aes(x = reorder(lemma, n), y = n)) +
  geom_col(fill = "skyblue") +  # bar heights are the precomputed counts
  coord_flip() +  # Flip the axes so the lemma labels are readable
  labs(
    title = "Top 20 most common favorite words",
    x = "Lemma",
    y = "Frequency"
  ) +
  theme_minimal()

# Horizontal bar chart of the most common lemmatised favorite phrases,
# ordered by count
ggplot(
  favorites_with_lemmas_counts,
  aes(x = reorder(favorite_phrase, n), y = n)
) +
  geom_col(fill = "skyblue") +
  coord_flip() +  # Flip the axes so the phrase labels are readable
  labs(
    title = "Top 20 most common favorite things",
    x = "Thing",
    y = "Frequency"
  ) +
  theme_minimal()

# Do some clustering