# analyze word associations

## Notes

load needed pre-processed data:

```r
## global variables:
# only if necessary to limit running time

# set the directory containing this script as the working directory
# setwd(dirname(rstudioapi::getSourceEditorContext()$path))

# load packages
require(pacman)
p_load('xlsx', 'associatoR', 'DT', 'igraph', 'tidyverse', 'wordcloud')

# load data
setwd("../generated data")

# for first order associations example
associationData_firstOrder <- xlsx::read.xlsx2(file = "word_associations_firstOrder_adjectives.xlsx", sheetIndex = 1)

## adjust ID variable
associationData_firstOrder$participant_id <- str_extract(string = associationData_firstOrder$participant_id,
                                                         pattern = "[:alpha:]*_[:digit:]*")
```
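For illustration, the ID cleanup keeps only the leading letter block, underscore, and digit block of each identifier (the raw ID below is a made-up example, not taken from the data):

```r
str_extract("participant_07_firstOrder", pattern = "[:alpha:]*_[:digit:]*")
#> [1] "participant_07"
```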
## first order associations

### describe data

```r
dim(associationData_firstOrder)
```

```
[1] 400   6
```

```r
table(associationData_firstOrder$cue)
```

```
       muskulös normalgewichtig   übergewichtig  untergewichtig 
            100             100             100             100 
```

```r
length(unique(associationData_firstOrder$participant_id))
```

```
[1] 20
```

dynamic data table:

```r
DT::datatable(data = associationData_firstOrder)
```
### analyze data

#### get word clouds

function to create word clouds:

```r
generate_wordcloud <- function(cue_word, data_set) {
  # Filter the data set for the given cue word
  tmp <- data_set[data_set$cue == cue_word, ]

  # Calculate word frequencies
  word_frequencies <- table(tmp$response)
  word_frequencies_df <- as.data.frame(word_frequencies)

  # Create the word cloud
  wordcloud(words = word_frequencies_df$Var1,
            freq = word_frequencies_df$Freq,
            min.freq = 1,
            max.words = 100,
            random.order = FALSE,
            colors = brewer.pal(8, "Dark2"))
}

generate_wordcloud("muskulös", associationData_firstOrder)
generate_wordcloud("normalgewichtig", associationData_firstOrder)
generate_wordcloud("übergewichtig", associationData_firstOrder)
generate_wordcloud("untergewichtig", associationData_firstOrder)
```
#### get semantic networks
The idea of Network Inference from Fluency Data is motivated by two articles:
- Wulff, D. U., Hills, T. T., & Mata, R. (2022). Structural differences in the semantic networks of younger and older adults. Scientific Reports, 12(1), Article 1. https://doi.org/10.1038/s41598-022-11698-4
- Goñi, J., Arrondo, G., Sepulcre, J., Martincorena, I., Vélez de Mendizábal, N., Corominas-Murtra, B., Bejarano, B., Ardanza-Trevijano, S., Peraita, H., Wall, D. P., & Villoslada, P. (2011). The semantic organization of the animal category: Evidence from semantic verbal fluency and network theory. Cognitive Processing, 12(2), 183–196. https://doi.org/10.1007/s10339-010-0372-x
function to create semantic networks (note: without checking whether the edges are "true", i.e., whether they appear more frequently than would be expected by chance; a sketch of such a check follows the examples below):
```r
# Function to create edges from the grouped responses:
# each response is linked to the responses that follow it
# within a window of `distance` positions
create_edges <- function(association, distance = 2) {
  edges <- data.frame()
  n <- length(association)

  for (i in 1:n) {
    for (d in 1:distance) {
      if (i + d <= n) {
        edges <- rbind(edges, data.frame(from = association[i], to = association[i + d]))
      }
    }
  }
  return(edges)
}
```
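For illustration (not part of the original script), applying `create_edges` to a toy response vector links each word to its neighbours one and two positions ahead when `distance = 2`:

```r
create_edges(c("fit", "stark", "gesund"), distance = 2)
#>    from     to
#> 1   fit  stark
#> 2   fit gesund
#> 3 stark gesund
```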
```r
generate_association_network <- function(cue_word, data_set, distance = 2) {
  # Filter the dataset based on the cue word
  tmp <- data_set[data_set$cue == cue_word, ]

  # print number of times a response was given:
  print("number of times a response was given")
  print(sort(table(tmp$response)))

  grouped_responses <- tmp %>%
    group_by(participant_id) %>%
    summarise(associations = list(response)) %>%
    ungroup()

  # Apply the function to create edges for each participant
  all_edges <- do.call(rbind, lapply(grouped_responses$associations, create_edges, distance = distance))

  # Calculate word frequencies across the entire data set
  word_frequencies <- tmp %>%
    count(response, name = "frequency")

  # Calculate the edge weights (how often each link appears between two words)
  edge_weights <- all_edges %>%
    group_by(from, to) %>%
    summarise(weight = n(), .groups = "drop")

  # Create the graph from the edge list with edge weights
  g <- graph_from_data_frame(edge_weights, directed = FALSE)

  # Get the list of unique words in the graph (vertices)
  vertices <- V(g)$name

  # Extract edge weights to adjust the edge widths
  edge_widths <- E(g)$weight

  # Map word frequencies to vertex sizes
  vertex_sizes <- sapply(vertices, function(word) {
    freq <- word_frequencies %>%
      filter(response == word) %>%
      pull(frequency)
    if (length(freq) == 0) {
      return(1) # Default size if the word is not found (shouldn't happen)
    } else {
      return(freq)
    }
  })

  # Plot the network with vertex sizes proportional to word frequencies
  # and edge widths proportional to edge weights
  plot(g,
       vertex.label.color = "black",
       vertex.size = vertex_sizes * 1.5, # Scale vertex sizes by word frequency
       edge.width = edge_widths * 1,     # Scale edge widths by weight (number of links)
       edge.arrow.size = 0.5)

  # Return the igraph object
  return(g)
}
```
```r
tmp_g <- generate_association_network("muskulös", associationData_firstOrder, distance = 2)
```

```
[1] "number of times a response was given"

          fitt       maskulin   selbstsicher  beeindruckend        kräftig 
             1              1              1              2              2 
leistungsfähig            fit  selbstbewusst      trainiert       männlich 
             2              4              7              7             10 
     sportlich         gesund          stark      attraktiv 
            10             14             19             20 
```

```r
mat <- as.matrix(as_adjacency_matrix(tmp_g, attr = "weight"))

# show only the sub-matrix of words that have at least one edge weight >= threshold
define_threshold <- 10
mat[apply(mat, 1, function(row) any(row >= define_threshold)),
    apply(mat, 2, function(row) any(row >= define_threshold))]
```

```
          attraktiv gesund stark
attraktiv         0     12    19
gesund           12      0    12
stark            19     12     0
```
```r
tmp_g <- generate_association_network("normalgewichtig", associationData_firstOrder, distance = 2)
```

```
[1] "number of times a response was given"

             aktiv            kräftig             normal              stark 
                 1                  1                  1                  1 
     wünschenswert     energiegeladen            perfekt        wohlgeformt 
                 1                  2                  2                  2 
      ausgeglichen         ausgewogen              ideal     leistungsfähig 
                 3                  3                  3                  3 
           optimal      selbstbewusst          sportlich wohlproportioniert 
                 4                  4                  4                  5 
         attraktiv                fit             gesund 
                20                 20                 20 
```

```r
mat <- as.matrix(as_adjacency_matrix(tmp_g, attr = "weight"))

define_threshold <- 10
mat[apply(mat, 1, function(row) any(row >= define_threshold)),
    apply(mat, 2, function(row) any(row >= define_threshold))]
```

```
          attraktiv fit gesund
attraktiv         0  20     20
fit              20   0     17
gesund           20  17      0
```
```r
tmp_g <- generate_association_network("übergewichtig", associationData_firstOrder, distance = 2)
```

```
[1] "number of times a response was given"

            anfällig            auffällig           bedenklich 
                   1                    1                    1 
           korpulent   krankheitsanfällig              riskant 
                   1                    1                    1 
             schwach            übermäßig            übermüdet 
                   1                    1                    1 
       ungleichmäßig           fettleibig        problematisch 
                   1                    3                    3 
             unschön          unsportlich                 dick 
                   3                    3                    5 
gesundheitsschädlich            krankhaft           unflexibel 
                   5                    5                    7 
            unförmig                krank                träge 
                   7                    9                   11 
         unattraktiv             ungesund 
                  12                   17 
```

```r
mat <- as.matrix(as_adjacency_matrix(tmp_g, attr = "weight"))

define_threshold <- 6
mat[apply(mat, 1, function(row) any(row >= define_threshold)),
    apply(mat, 2, function(row) any(row >= define_threshold))]
```

```
           krank träge unflexibel unförmig ungesund
krank          0     1          2        4        9
träge          1     0          6        4        6
unflexibel     2     6          0        0        4
unförmig       4     4          0        0        7
ungesund       9     6          4        7        0
```
```r
tmp_g <- generate_association_network("untergewichtig", associationData_firstOrder, distance = 2)
```

```
[1] "number of times a response was given"

             ärmlich           ausgezehrt              besorgt 
                   1                    1                    1 
          energielos              hungrig           mangelhaft 
                   1                    1                    1 
       problematisch             ungelenk         zerbrechlich 
                   1                    1                    1 
          abgemagert                 dünn gesundheitsschädlich 
                   2                    2                    2 
       mangelernährt            krankhaft             anfällig 
                   2                    4                    5 
         unattraktiv             ungesund               fragil 
                   8                    8                    9 
               mager                krank              schwach 
                  13                   16                   20 
```

```r
mat <- as.matrix(as_adjacency_matrix(tmp_g, attr = "weight"))

define_threshold <- 6
mat[apply(mat, 1, function(row) any(row >= define_threshold)),
    apply(mat, 2, function(row) any(row >= define_threshold))]
```

```
            krank mager schwach unattraktiv ungesund
krank           0     7      16           3        2
mager           7     0      12           3        6
schwach        16    12       0           7        6
unattraktiv     3     3       7           0        1
ungesund        2     6       6           1        0
```
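As flagged above, these networks keep every observed co-occurrence edge without testing whether it appears more often than expected by chance. Below is a minimal sketch of such a check, assuming a permutation baseline in which response order is shuffled within participants; the function name `edge_weight_null` and the max-statistic cutoff are illustrative choices, not part of the original pipeline:

```r
# Sketch: permutation baseline for edge weights.
# Assumption: shuffling response order within participants approximates
# the edge weights expected under random ordering of responses.
edge_weight_null <- function(data_set, cue_word, distance = 2, n_perm = 200) {
  tmp <- data_set[data_set$cue == cue_word, ]
  null_max <- numeric(n_perm)
  for (p in seq_len(n_perm)) {
    shuffled <- tmp %>%
      group_by(participant_id) %>%
      mutate(response = sample(response)) %>% # shuffle within participant
      summarise(associations = list(response), .groups = "drop")
    edges <- do.call(rbind, lapply(shuffled$associations, create_edges, distance = distance))
    null_max[p] <- edges %>%
      group_by(from, to) %>%
      summarise(weight = n(), .groups = "drop") %>%
      pull(weight) %>%
      max()
  }
  null_max
}

# Observed edge weights could then be compared against, e.g.,
# quantile(edge_weight_null(associationData_firstOrder, "muskulös"), 0.95)
```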
## second order associations (not shown)

some exploratory code using the associatoR package:

```r
ar_obj <- ar_import(associationData_firstOrder,
                    participant = participant_id,
                    cue = cue,
                    response = response,
                    response_vars = c(response_position))

str(ar_obj)
```
```r
# normalize responses
ar_obj <- ar_normalize(ar_obj,
                       case = "most_frequent",
                       punct = "all",
                       whitespace = "squish",
                       process_cues = TRUE)

ar_obj <- ar_set_targets(ar_obj, targets = "cues")

ar_obj <- ar_summarize_targets(ar_obj,
                               response_var = response_position,
                               fun = mean)
ar_obj$targets

ar_obj <- ar_embed_targets(ar_obj,
                           method = "ppmi-svd",
                           n_dim = 100)

projection <- ar_project_embedding(ar_obj, method = "umap")

ar_plot_wordcloud(ar_obj,
                  facet_col = cues,
                  facet_row = NULL,
                  color_by = cues,
                  top_n = 20)
```
```r
# Importing the associationData_firstOrder data
ar_obj <- ar_import(data = associationData_firstOrder,
                    participant = participant_id,
                    cue = cue,
                    response = response,
                    participant_vars = c(gender, agegroup),
                    response_vars = c(response_position))

# export wordlist
# associatoR::ar_wordlist_export(ar_obj,
#                                file = "outputs/associationData_firstOrder_wordlist.xlsx")

# import wordlist
# ar_obj <- associatoR::ar_wordlist_import(ar_obj,
#                                          file = "associationData_firstOrder_wordlist.csv",
#                                          process_cues = TRUE,
#                                          na = c("NA"))

# ar_obj <- ar_count_targets(ar_obj) # to get frequencies of targets
# ar_obj$targets
```
```r
ar_obj <- ar_characterize_targets(
  ar_obj,
  characteristics = c("valence", "arousal", "dominance"),
  case_sensitive = FALSE
  # "word_frequency", "concreteness"
)
ar_obj$targets

ar_obj <- ar_summarize_targets(ar_obj,
                               response_var = response_position,
                               fun = mean)
ar_obj$targets

ar_obj <- ar_correlate_targets(
  ar_obj,
  participant_vars = c(gender, agegroup),
  metric = "auto"
)
ar_obj$targets$target[order(ar_obj$targets$gender_corr)[1:5]]                    # given more frequently by women
ar_obj$targets$target[order(ar_obj$targets$gender_corr, decreasing = TRUE)[1:5]] # given more frequently by men
```
```r
#######################################
tmp <- ar_cross_targets(ar_obj,
                        participant_vars = c(gender), # agegroup
                        target_var = target,
                        normalize = FALSE)
tmp[tmp$target == "empathy", ]
tmp[tmp$target == "Elon Musk", ]
#######################################

ar_obj <- ar_cluster_targets(ar_obj,
                             method = "louvain",
                             similarity = "cosine",
                             resolution = 1.1)
table(ar_obj$targets$cluster)

# ar_plot_embedding(
#   ar_obj,
#   color_by = cluster,
#   color_set = "G",
#   alpha = 0.5,
#   proportion_labels = 1
# )
```
```r
ar_import(associationData_firstOrder,
          participant = participant_id,
          cue = cue,
          response = response,
          participant_vars = c(gender, agegroup),
          response_vars = c(response_position, response_level)) %>%
  ar_set_targets("cues") %>%
  ar_embed_targets() %>%
  ar_cluster_targets() %>%
  ar_project_embedding() %>%
  ar_plot_embedding(color_by = cluster,
                    proportion_labels = .5)

ar_plot_wordcloud(ar_obj,
                  facet_col = cluster,
                  facet_row = NULL,
                  color_by = cluster,
                  top_n = 20)
```
```r
# ar_cluster_stability(ar_obj)

ar_cross_targets(ar_obj,
                 participant_vars = c(gender), # agegroup
                 target_var = cluster,
                 normalize = TRUE)

ar_compare_targets(ar_obj,
                   participant_vars = c(gender, agegroup),
                   target_var = valence,
                   fun = median, na.rm = TRUE)

ar_compare_embeddings(ar_obj,
                      participant_vars = c(gender, agegroup),
                      type = "triangle",
                      intersection = "pair")
```
```r
ar_import(associationData_firstOrder,
          participant = participant_id,
          cue = cue,
          response = response,
          response_vars = c(response_position)) %>%
  ar_set_targets("cues") %>%
  ar_embed_targets(method = "counts") %>%
  ar_cluster_targets(method = "hclust", k = 3) %>%
  ar_project_embedding(method = "mds") %>%
  ar_plot_embedding(color_by = cluster,
                    proportion_labels = .5)

ar_plot_wordcloud(ar_obj,
                  facet_col = cluster,
                  facet_row = NULL,
                  color_by = cluster,
                  top_n = 20)
```