Are semantic networks related to demographic variables? This is a preliminary analysis of data from Small World of Words.
d = read.csv("../data/associations_ppdetails_en_05_01_2015.csv")
d.clean = d %>%
filter(gender == "Ma"| gender == "Fe") %>%
filter(education > 0) %>%
filter(nativeLanguage != "") %>%
mutate(gender = droplevels(gender),
gender = plyr::revalue(gender,c("Fe" = "F", "Ma" = "M")),
userID = as.factor(userID),
nativeLanguage = as.factor(tolower(nativeLanguage)),
education = as.factor(education),
education = plyr::revalue(education,c("1" = "none",
"2" = "elementary",
"3" = "HS",
"4" = "bachelor",
"5" = "master")))
demo.summary = d.clean %>%
group_by(userID) %>%
slice(1) %>%
ungroup() %>%
select(1:5)
PROP_FEMALE = round(nrow(filter(demo.summary, gender == "F"))/nrow(demo.summary),2)
We have complete demographic data for 18885 participant. 66% are female.
ggplot(demo.summary, aes(x=age)) +
geom_density(alpha=.3, fill="#FF6666") +
geom_vline(xintercept = mean(demo.summary$age), linetype = "longdash") +
theme_bw(base_size = 18)
The population is relatively young (M = 35.37).
ggplot(demo.summary, aes(x=age, group = gender, fill = gender)) +
geom_density(alpha=.3) +
theme_bw(base_size = 18)
lang = demo.summary %>%
group_by(nativeLanguage) %>%
summarise(n = n())
ggplot(lang, aes(x = reorder(nativeLanguage,-n), y = n)) +
geom_bar(stat = "identity") +
xlab("language") +
theme_bw(base_size = 15) +
theme(legend.position="none",
axis.text.x = element_text(angle = 90, hjust = 1))
Most have language “usa”, followed by “great britain”, “canada”, and “australia”. Which means, everyone is an english speaker?
education = demo.summary %>%
group_by(education) %>%
summarise(n = n())
ggplot(education, aes(x = education, y = n)) +
geom_bar(stat = "identity") +
xlab("education level") +
theme_bw(base_size = 18) +
theme(legend.position="none")
Pretty highly educated group.
education.gender = demo.summary %>%
group_by(education) %>%
summarise (n=n()) %>%
left_join(demo.summary %>% group_by(education) %>%
filter(gender == "F") %>%
summarise(female = n()), by= "education") %>%
mutate(male = n - female) %>%
mutate (p.female = female/n,
p.male = male/n) %>%
gather("gender", "prop", 5:6)
ggplot(education.gender, aes(x = education, fill = gender, y = prop)) +
geom_bar(stat = "identity") +
geom_hline(yintercept = PROP_FEMALE, linetype = "longdash") +
xlab("education level") +
theme_bw(base_size = 18)
The dashed line is the total proportion female. Females are slightly more educated than males.
ggplot(demo.summary, aes(x=age, group = education, fill = education)) +
geom_density(alpha=.3) +
theme_bw(base_size = 18)
d.clean %>%
group_by(userID) %>%
summarise(n= n()) %>%
ggplot(aes(x = n)) +
geom_histogram(binwidth = 1) +
xlab("n trials per participants") +
ylab("n participants") +
theme_bw(base_size = 15)
Most participants completed 15, 16, or 18 trials.
word.summary = d.clean %>%
group_by(cue) %>%
summarise(n = n())
ggplot(word.summary, aes(x = reorder(cue,-n), y = n)) +
geom_bar(stat = "identity") +
ylab("n trials")+
xlab("distinct cue words") +
geom_hline(yintercept = mean(word.summary$n), color = "red",
linetype = "longdash") +
theme_bw(base_size = 12) +
theme(legend.position="none",
axis.ticks = element_blank(), axis.text.x = element_blank())
There are 6046 unique cue words. We have 3046 different cue words with more than 75 trials.
word.a.summary = d.clean %>%
gather("association", "word", 7:9) %>%
mutate(word = gsub("\\bx\\b", NA, word)) %>% # replace "don't knows" with NA
group_by(association) %>%
summarise(n_na = length(which(is.na(word))),
n = n(),
prop.missing= n_na/n())
ggplot(word.a.summary, aes(y=prop.missing, x = association)) +
geom_bar(stat = "identity") +
ylab("number missing") +
theme_bw(base_size = 18)
Participants can say they don’t know a word to skip it (but apparently this means they can still enter some associations?). This shows the proportion missing for each assocation, which is about 10%.