Analyze word associations

Author

Julius Fenn

Notes

## global variables
# set only if necessary to limit running time

Load the pre-processed data

# set the directory of this script as the current working directory (works in RStudio only)
# setwd(dirname(rstudioapi::getSourceEditorContext()$path))

# load packages (p_load installs any that are missing)
require(pacman)
p_load('xlsx', 'associatoR', 'DT', 'igraph')


# load data
setwd("../generated data")
associationData <- xlsx::read.xlsx2(file = "word_associations_translated.xlsx", sheetIndex = 1) # alternatively: "word_associations.xlsx"
table(associationData$cue)

     muscular normal weight    overweight   underweight 
           50            50            50            50 
dim(associationData)
[1] 200   6
DT::datatable(data = associationData)
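A quick sanity check before plotting can rule out missing or empty responses (a minimal sketch, assuming the column names shown above):

# sanity checks: no missing/empty responses; distribution of responses per participant
stopifnot(!any(is.na(associationData$response)),
          !any(associationData$response == ""))
table(table(associationData$participant_id))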

Get word clouds

tmp <- associationData[associationData$cue == "muscular", ]

word_frequencies <- table(tmp$response)
word_frequencies_df <- as.data.frame(word_frequencies)

library(wordcloud)
Lade nötiges Paket: RColorBrewer
# Create the word cloud
wordcloud(words = word_frequencies_df$Var1, 
          freq = word_frequencies_df$Freq, 
          min.freq = 1, 
          max.words = 100, 
          random.order = FALSE, 
          colors = brewer.pal(8, "Dark2"))

tmp <- associationData[associationData$cue == "normal weight", ]

word_frequencies <- table(tmp$response)
word_frequencies_df <- as.data.frame(word_frequencies)

library(wordcloud)
# Create the word cloud
wordcloud(words = word_frequencies_df$Var1, 
          freq = word_frequencies_df$Freq, 
          min.freq = 1, 
          max.words = 100, 
          random.order = FALSE, 
          colors = brewer.pal(8, "Dark2"))

tmp <- associationData[associationData$cue == "overweight", ]

word_frequencies <- table(tmp$response)
word_frequencies_df <- as.data.frame(word_frequencies)

library(wordcloud)
# Create the word cloud
wordcloud(words = word_frequencies_df$Var1, 
          freq = word_frequencies_df$Freq, 
          min.freq = 1, 
          max.words = 100, 
          random.order = FALSE, 
          colors = brewer.pal(8, "Dark2"))

tmp <- associationData[associationData$cue == "underweight", ]

word_frequencies <- table(tmp$response)
word_frequencies_df <- as.data.frame(word_frequencies)

library(wordcloud)
# Create the word cloud
wordcloud(words = word_frequencies_df$Var1, 
          freq = word_frequencies_df$Freq, 
          min.freq = 1, 
          max.words = 100, 
          random.order = FALSE, 
          colors = brewer.pal(8, "Dark2"))

Get semantic networks

tmp <- associationData[associationData$cue == "overweight", ]
tmp$response[tmp$participant_id == unique(tmp$participant_id)[1]]
[1] "Health risks"            "Obesity"                
[3] "Diet"                    "Lack of exercise"       
[5] "Loss of self-confidence"
tmp[c("participant_id", "response")]
    participant_id                response
1          Run_1_1            Health risks
2          Run_1_1                 Obesity
3          Run_1_1                    Diet
4          Run_1_1        Lack of exercise
5          Run_1_1 Loss of self-confidence
21         Run_2_1            Health risks
22         Run_2_1                    Diet
23         Run_2_1              Overweight
24         Run_2_1                 Obesity
25         Run_2_1         Health problems
41         Run_3_1         Health problems
42         Run_3_1            Fat deposits
43         Run_3_1      Difficulty walking
44         Run_3_1 Unattractive appearance
45         Run_3_1           Eating habits
61         Run_4_1             Health risk
62         Run_4_1                 Obesity
63         Run_4_1                    Diet
64         Run_4_1              Overweight
65         Run_4_1               Nutrition
81         Run_5_1            Health risks
82         Run_5_1              Overweight
83         Run_5_1                 Obesity
84         Run_5_1            Difficulties
85         Run_5_1 Loss of self-confidence
101        Run_6_1         Health problems
102        Run_6_1                    Diet
103        Run_6_1              Overweight
104        Run_6_1                 Obesity
105        Run_6_1          Dietary change
121        Run_7_1         Health problems
122        Run_7_1                    Diet
123        Run_7_1              Overweight
124        Run_7_1          Dietary change
125        Run_7_1 Loss of self-confidence
141        Run_8_1            Health risks
142        Run_8_1                 Obesity
143        Run_8_1                    Diet
144        Run_8_1              Overweight
145        Run_8_1           Eating habits
161        Run_9_1         Health problems
162        Run_9_1                    Diet
163        Run_9_1                 Obesity
164        Run_9_1    Difficulty in sports
165        Run_9_1 Loss of self-confidence
181       Run_10_1             Health risk
182       Run_10_1                    Diet
183       Run_10_1                 Obesity
184       Run_10_1              Overweight
185       Run_10_1          Dietary change
# Group responses by participant
library(dplyr)
Warning: package 'dplyr' was built under R version 4.3.2

Attaching package: 'dplyr'
The following objects are masked from 'package:igraph':

    as_data_frame, groups, union
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
grouped_responses <- tmp %>%
  group_by(participant_id) %>%
  summarise(associations = list(response)) %>%
  ungroup()
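Each row of grouped_responses now holds one participant and a list-column with that participant's responses in their original order, e.g. for the participant inspected above:

# list-column entry for participant Run_1_1 (shown above)
grouped_responses$associations[[which(grouped_responses$participant_id == "Run_1_1")]]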



# Function to create edges from the grouped responses: each response is linked
# to the next `distance` responses of the same participant (sliding window)
create_edges <- function(association, distance = 2) {
  edges <- data.frame()
  n <- length(association)
  
  for (i in 1:n) {
    for (d in 1:distance) {
      if (i + d <= n) {
        edges <- rbind(edges, data.frame(from = association[i], to = association[i + d]))
      }
    }
  }
  return(edges)
}
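For instance, with the default distance = 2, three responses yield three edges:

# toy example: A is linked to B and C, B is linked to C
create_edges(c("A", "B", "C"))
#   from to
# 1    A  B
# 2    A  C
# 3    B  C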

# Apply the function to create edges for each participant
all_edges <- do.call(rbind, lapply(grouped_responses$associations, create_edges))

# Calculate word frequencies across the entire dataset
word_frequencies <- tmp %>%
  count(response, name = "frequency")

# Calculate the edge weights (how often each link appears between two words)
edge_weights <- all_edges %>%
  group_by(from, to) %>%
  summarise(weight = n(), .groups = "drop")

# Create the undirected graph from the edge list; the weight column becomes an edge attribute
g <- graph_from_data_frame(edge_weights, directed = FALSE)

# Get the list of unique words in the graph (vertices)
vertices <- V(g)$name

# Map word frequencies to vertex sizes
vertex_sizes <- sapply(vertices, function(word) {
  freq <- word_frequencies %>%
    filter(response == word) %>%
    pull(frequency)
  
  if(length(freq) == 0) {
    return(1)  # Default size if the word is not found (shouldn't happen)
  } else {
    return(freq)
  }
})

# Extract edge weights to adjust the edge widths
edge_widths <- E(g)$weight

# Plot the network with vertex sizes proportional to word frequencies and edge widths proportional to edge weights
plot(g,
     vertex.label.color = "black",
     vertex.size = vertex_sizes * 2,  # scale vertex sizes by word frequency
     edge.width = edge_widths)        # scale edge widths by weight (number of links)
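igraph's default layout is stochastic, so the figure changes between runs; fixing the seed and passing an explicit layout makes it reproducible, e.g.:

# reproducible variant: fixed seed plus explicit force-directed layout
set.seed(123)
plot(g,
     layout = layout_with_fr(g),
     vertex.label.color = "black",
     vertex.size = vertex_sizes * 2,
     edge.width = edge_widths)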

sort(table(tmp$response))

           Difficulties    Difficulty in sports      Difficulty walking 
                      1                       1                       1 
           Fat deposits        Lack of exercise               Nutrition 
                      1                       1                       1 
Unattractive appearance           Eating habits             Health risk 
                      1                       2                       2 
         Dietary change            Health risks Loss of self-confidence 
                      3                       4                       4 
        Health problems              Overweight                    Diet 
                      5                       7                       8 
                Obesity 
                      8 
# adjacency matrix of edge counts (use attr = "weight" for summed weights);
# restrict to well-connected words
mat <- as.matrix(as_adjacency_matrix(g))
mat[rowSums(mat) >= 5, colSums(mat) >= 5]
                        Diet Health problems Obesity Overweight Loss of self-confidence
Diet                       0               1       2          1                       1
Health problems            1               0       2          2                       0
Obesity                    2               2       0          2                       1
Overweight                 1               2       2          0                       1
Loss of self-confidence    1               0       1          1                       0
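The same structure can be summarized numerically, e.g. via the weighted degree (node strength) of each word:

# weighted degree (sum of co-occurrence weights per word), strongest first
sort(strength(g), decreasing = TRUE)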

OLD: earlier associatoR-based analysis (kept for reference)

ar_obj <- ar_import(associationData,
                    participant = participant_id,
                    cue = cue,
                    response = response,
                    response_vars = c(response_position))
str(ar_obj)

# normalize responses
ar_obj <- ar_normalize(ar_obj,
                       case = "most_frequent",
                       punct = "all",
                       whitespace = "squish",
                       process_cues = TRUE)


# use the cues as analysis targets
ar_obj <- ar_set_targets(ar_obj, targets = "cues")


# mean response position per target
ar_obj <- ar_summarize_targets(ar_obj,
                               response_var = response_position,
                               fun = mean)



ar_obj$targets


# embed targets via PPMI-weighted SVD
ar_obj <- ar_embed_targets(ar_obj,
                           method = "ppmi-svd",
                           n_dim = 100)

projection <- ar_project_embedding(ar_obj, method = "umap")



ar_plot_wordcloud(
  ar_obj,
  facet_col = cues,
  facet_row = NULL,
  color_by = cues,
  top_n = 20
)
# re-import the data, this time including participant variables
ar_obj <- ar_import(data = associationData,
                    participant = participant_id,
                    cue = cue,
                    response = response,
                    participant_vars = c(gender,
                                         agegroup),
                    response_vars = c(response_position))




# export wordlist
# associatoR::ar_wordlist_export(ar_obj,
#                    file = "outputs/associationData_wordlist.xlsx")
# import wordlist
# ar_obj <- associatoR::ar_wordlist_import(ar_obj,
#                              file = "associationData_wordlist.csv",
#                              process_cues = TRUE,
#                              na = c("NA"))




# ar_obj <- ar_count_targets(ar_obj) # to get frequencies of target
# ar_obj$targets

ar_obj <- ar_characterize_targets(
  ar_obj,
  characteristics = c("valence", "arousal", "dominance"),
  case_sensitive = FALSE
) # "word_frequency", "concreteness"
ar_obj$targets


ar_obj <- ar_summarize_targets(ar_obj,
                               response_var = response_position,
                               fun = mean)
ar_obj$targets



ar_obj <- ar_correlate_targets(
  ar_obj,
  participant_vars = c(gender, agegroup),
  metric = "auto"
)

ar_obj$targets$target[order(ar_obj$targets$gender_corr)[1:5]] # mentioned more frequently by women
ar_obj$targets$target[order(ar_obj$targets$gender_corr, decreasing = TRUE)[1:5]] # mentioned more frequently by men


#######################################
tmp <- ar_cross_targets(ar_obj,
                        participant_vars = c(gender), # or agegroup
                        target_var = target,
                        normalize = FALSE)
# note: these look-up targets appear to stem from a different dataset
tmp[tmp$target == "empathy",]
tmp[tmp$target == "Elon Musk",]
#######################################


ar_obj <- ar_cluster_targets(ar_obj,
                             method = "louvain",
                             similarity = "cosine",
                             resolution = 1.1)
table(ar_obj$targets$cluster)


# ar_plot_embedding(
#   ar_obj,
#   color_by = cluster,
#   color_set = "G",
#   alpha = 0.5,
#   proportion_labels = 1
# )

ar_import(associationData,
          participant = participant_id,
          cue = cue,
          response = response,
          participant_vars = c(gender, agegroup),
          response_vars = c(response_position, response_level)) %>% # note: response_level may not be present in the current data
  ar_set_targets("cues") %>%
  ar_embed_targets() %>%
  ar_cluster_targets() %>%
  ar_project_embedding() %>%
  ar_plot_embedding(color_by = cluster,
                    proportion_labels = .5)



ar_plot_wordcloud(
  ar_obj,
  facet_col = cluster,
  facet_row = NULL,
  color_by = cluster,
  top_n = 20
)


# ar_cluster_stability(ar_obj)



ar_cross_targets(ar_obj,
                participant_vars = c(gender), # agegroup
                target_var = cluster,
                normalize = TRUE)


ar_compare_targets(ar_obj,
                   participant_vars = c(gender, agegroup),
                   target_var = valence,
                   fun = median, na.rm = TRUE)



ar_compare_embeddings(ar_obj,
                      participant_vars = c(gender, agegroup),
                      type = "triangle",
                      intersection = "pair")


ar_import(associationData,
          participant = participant_id,
          cue = cue,
          response = response,
          response_vars = c(response_position)) %>%
  ar_set_targets("cues") %>%
  ar_embed_targets(method = "counts") %>%
  ar_cluster_targets(method = "hclust", k = 3) %>%
  ar_project_embedding(method = "mds") %>%
  ar_plot_embedding(color_by = cluster,
                    proportion_labels = .5)


