Are semantic networks related to demographic variables? Summary statistics of data from Small World of Words.
d = read.csv("../data/associations_ppdetails_en_05_01_2015.csv")
d.clean = d %>%
filter(gender == "Ma"| gender == "Fe") %>%
filter(education > 0) %>%
filter(nativeLanguage != "") %>%
mutate(gender = droplevels(gender),
gender = plyr::revalue(gender,c("Fe" = "F", "Ma" = "M")),
userID = as.factor(userID),
nativeLanguage = as.factor(tolower(nativeLanguage)))
d.clean = d.clean %>%
gather("association", "word", 7:9) %>%
mutate(word = gsub("\\bx\\b", "NA", word)) %>% # remove missing words
spread("association", "word") %>%
rename(a1 = asso1Clean,
a2 = asso2Clean,
a3 = asso3Clean)
Merge in frquency data. Frequency taken here from subtlex-us (Brysbaert & New, 2009).
subtlexus.url <- getURL("https://raw.githubusercontent.com/mllewis/RC/master/data/corpus/SUBTLEXus_corpus.txt")
freqs <- read.table(text = subtlexus.url, header=TRUE) %>%
select(Word,Lg10WF)
freq.data = d.clean %>%
left_join(freqs, by = c("cue" = "Word")) %>%
rename(cue.freq.log = Lg10WF) %>%
left_join(freqs, by = c("a1" = "Word")) %>%
rename(a1.freq.log = Lg10WF) %>%
left_join(freqs, by = c("a2" = "Word")) %>%
rename(a2.freq.log = Lg10WF) %>%
left_join(freqs, by = c("a3" = "Word")) %>%
rename(a3.freq.log = Lg10WF)
freq.ms.type = freq.data %>%
gather("word_type", "freq", 10:13) %>%
mutate(word_type= as.factor(word_type)) %>%
group_by(userID, word_type) %>%
summarise(mean.freq = mean(freq)) %>%
group_by(word_type) %>%
multi_boot_standard(column = "mean.freq", na.rm = T)
levels(freq.ms.type$word_type) = c("a1", "a2", "a3", "cue")
freq.ms.type$word_type = factor(freq.ms.type$word_type,levels(freq.ms.type$word_type)[c(4, 1:3)])
ggplot(freq.ms.type, aes(x = word_type, y = mean, group = 1)) +
geom_line(color = "black") +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
xlab("Word Type") +
ylab("Mean log frequency") +
theme_bw(base_size = 15) +
theme(legend.position="none")
Might be interesting to condition on cue here, by binning frequency.
Mean frequency
freq.ms = freq.data %>%
gather("association", "freq", 11:13) %>%
group_by(userID) %>%
summarise(mean.assoc.freq = mean(freq, na.rm = T))
freq.ed.ms = freq.ms %>%
left_join(freq.data %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(education, userID)) %>%
group_by(education) %>%
multi_boot_standard(column = "mean.assoc.freq", na.rm = T) %>%
filter(education > 1)
ggplot(freq.ed.ms, aes(y = mean, x = education)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
geom_line() +
theme_bw(base_size = 15) +
ylab("mean log frequency of associates") +
scale_x_continuous(breaks = c(2,3,4,5), labels = c("elementary", "HS",
"bachelor", "master"))
Change in frequency across associates.
freq.ms.as = freq.data %>%
group_by(userID) %>%
summarise(mean.a1.freq = mean(a1.freq.log, na.rm = T),
mean.a2.freq = mean(a2.freq.log, na.rm = T),
mean.a3.freq = mean(a3.freq.log, na.rm = T))
freq.ms.as2 = freq.ms.as %>%
left_join(freq.data %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(education, userID)) %>%
gather(association, value, 2:4) %>%
group_by(education, association) %>%
multi_boot_standard(column = "value", na.rm = T) %>%
filter(education > 1) %>%
ungroup() %>%
mutate(education = as.factor(education))
ggplot(freq.ms.as2, aes(y = mean, x = association,
group = education, color = education)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
geom_line()+
theme_bw(base_size = 15) +
ylab("mean log frequency of associates") +
scale_x_discrete(labels=c("1", "2", "3")) +
scale_colour_discrete(labels = c("elementary", "HS", "bachelor", "master"))
Frequency decreases across associates, for all age groups.
Mean frequency
freq.age.ms = freq.ms %>%
left_join(freq.data %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(age, userID)) %>%
#mutate(age.bin = cut_width(age, width = 2)) %>%
group_by(age) %>%
multi_boot_standard(column = "mean.assoc.freq", na.rm = T)
ggplot(freq.age.ms, aes(y = mean, x = age)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower), size = .2) +
#geom_line() +
theme_bw(base_size = 15) +
ylab("mean log frequency of associates")
p2 = freq.age.ms %>%
filter(age > 15 & age < 80) %>%
ggplot(aes(y = mean, x = age)) +
geom_smooth(method = "lm") +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
theme_bw(base_size = 15) +
xlab("Age (years)") +
ylab("Mean log frequency of associates") +
theme(legend.position = "bottom",
legend.key = element_blank(),
legend.background = element_rect(fill = "transparent"),
legend.title = element_blank(),
axis.line= element_line(size = 40),
axis.text = element_text(colour = "black", size = 17),
axis.title = element_text(colour = "black", size = 17),
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line.x = element_line(color="black", size = .8),
axis.line.y = element_line(color="black", size = .8))
jpeg("freq_SWOW.jpg", width = 5, height = 5, units = 'in', res = 300)
p2
dev.off()
## quartz_off_screen
## 2
freq.all.ms = freq.ms %>%
left_join(freq.data %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(education, age, userID))
tidy(lm(mean.assoc.freq~age + education, filter(freq.all.ms, education > 0)))
## term estimate std.error statistic p.value
## 1 (Intercept) 3.160430294 0.0111460942 283.5459885 0.000000e+00
## 2 age -0.004735110 0.0001364198 -34.7098570 5.867963e-256
## 3 education -0.001223986 0.0026023239 -0.4703436 6.381150e-01
Education does not predict frequency, controling for age.
Change in frequency across associates.
freq.ms.as2 = freq.ms.as %>%
left_join(freq.data %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(age, userID)) %>%
mutate(age.bin = cut_width(age, width = 25)) %>%
gather(association, value, 2:4) %>%
group_by(age.bin, association) %>%
multi_boot_standard(column = "value", na.rm = T)
ggplot(freq.ms.as2, aes(y = mean, x = association,
group = age.bin, color = age.bin)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
geom_line()+
facet_grid(.~age.bin) +
theme_bw(base_size = 15) +
ylab("mean log frequency of associates") +
scale_x_discrete(labels=c("1", "2", "3"))
Mean frequency
freq.gender.ms = freq.ms %>%
left_join(freq.data %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(gender, userID)) %>%
group_by(gender) %>%
multi_boot_standard(column = "mean.assoc.freq", na.rm = T)
ggplot(freq.gender.ms, aes(y = mean, x = gender)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower), size = .2) +
theme_bw(base_size = 15) +
ylab("mean log frequency of associates")
freq.all.ms = freq.ms %>%
left_join(freq.data %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(gender, age, education, userID))
tidy(lm(mean.assoc.freq~age + gender + education, freq.all.ms))
## term estimate std.error statistic p.value
## 1 (Intercept) 3.189741105 0.0113581872 280.831884 0.000000e+00
## 2 age -0.004844193 0.0001361785 -35.572383 2.532683e-268
## 3 genderM -0.049420917 0.0040406778 -12.230848 2.867446e-34
## 4 education -0.003309065 0.0025977441 -1.273822 2.027420e-01
Females have higher frequency associates, controling for age and education.
Change in frequency across associates.
freq.ms.as2 = freq.ms.as %>%
left_join(freq.data %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(gender, userID)) %>%
gather(association, value, 2:4) %>%
group_by(gender, association) %>%
multi_boot_standard(column = "value", na.rm = T)
ggplot(freq.ms.as2, aes(y = mean, x = association,
group = gender, color = gender)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
geom_line()+
theme_bw(base_size = 15) +
ylab("mean log frequency of associates") +
scale_x_discrete(labels=c("1", "2", "3"))
Mean frequency
top_langs = d.clean %>%
group_by(userID) %>%
slice(1) %>%
group_by(nativeLanguage) %>%
summarise(n = n()) %>%
top_n(10,n) %>%
select(nativeLanguage) %>%
mutate(bilingual = c(1,1,0,0,0,1,0,0,0,1)) %>%
filter(nativeLanguage != "nan")
freq.ms = freq.data %>%
gather("association", "freq", 11:13) %>%
group_by(userID) %>%
summarise(mean.assoc.freq = mean(freq, na.rm = T))
freq.lang.ms = freq.ms %>%
left_join(freq.data %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(nativeLanguage, userID)) %>%
filter(nativeLanguage %in% levels(droplevels(top_langs$nativeLanguage))) %>%
group_by(nativeLanguage) %>%
multi_boot_standard(column = "mean.assoc.freq", na.rm = T)
ggplot(freq.lang.ms, aes(y = mean, x = reorder(nativeLanguage,-mean))) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower), size = .2) +
theme_bw(base_size = 15) +
xlab("country (top 10)") +
ylab("mean log frequency of associates")
Second language speakers have higher frequency associates than native speakers?
freq.all.ms = freq.ms %>%
left_join(freq.data %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(gender, age, education, userID, nativeLanguage)) %>%
left_join(top_langs, by = "nativeLanguage") %>%
filter(nativeLanguage %in% levels(droplevels(top_langs$nativeLanguage)))
tidy(lm(mean.assoc.freq~age + gender + education + nativeLanguage, freq.all.ms))
## term estimate std.error statistic p.value
## 1 (Intercept) 3.185845719 0.0144885033 219.8878420 0.000000e+00
## 2 age -0.004375775 0.0001442277 -30.3393631 1.331313e-196
## 3 genderM -0.053162149 0.0043546879 -12.2080274 4.003483e-34
## 4 education -0.012124774 0.0027919460 -4.3427679 1.415695e-05
## 5 nativeLanguagecan 0.015233118 0.0112818640 1.3502306 1.769612e-01
## 6 nativeLanguagedut 0.166985083 0.0171097170 9.7596637 1.938912e-22
## 7 nativeLanguagefin 0.096732669 0.0178511272 5.4188550 6.085504e-08
## 8 nativeLanguagefla 0.153942809 0.0173505213 8.8725178 7.899749e-19
## 9 nativeLanguagegbr -0.024331508 0.0096554283 -2.5199822 1.174579e-02
## 10 nativeLanguageger 0.122377923 0.0160703280 7.6151478 2.781581e-14
## 11 nativeLanguagespa 0.192901771 0.0138910799 13.8867369 1.369745e-43
## 12 nativeLanguageusa 0.006590866 0.0090269373 0.7301331 4.653196e-01
tidy(lm(mean.assoc.freq~age + gender + education + bilingual, freq.all.ms))
## term estimate std.error statistic p.value
## 1 (Intercept) 3.343196970 0.013557141 246.600446 0.000000e+00
## 2 age -0.004489705 0.000143459 -31.296075 1.062193e-208
## 3 genderM -0.055746649 0.004346521 -12.825580 1.807896e-37
## 4 education -0.011980606 0.002796304 -4.284444 1.842732e-05
## 5 bilingual -0.153584905 0.006507269 -23.602052 4.387146e-121
Relationship between native language and frequency, controling for other stuff.
Read in POS
pos2 = d.clean %>%
gather("category", "word", 6:9) %>%
select(word) %>%
filter(!grepl(" ", word)) %>%
distinct(word)
# write.csv(pos2, "all_words.csv", row.names = F, col.names = F)
# run pos_tagger.py script
pos = read.csv("../data/pos_all_words.csv",
col.names = c("word", "pos"), header = F)
sent = read.csv("../data/sent_all_words.csv",
col.names = c("word", "polarity", "subjectivity"), header = F)
lemma = read.csv("../data/lemma_all_words.csv",
col.names = c("word", "lemma"), header = F)
Make POS human readable (Penn POS)
pos.collapsed = pos %>%
mutate(pos = as.character(pos)) %>%
rowwise() %>%
mutate(pos = if(is.na(pos)) as.character(pos)
else if(pos == "VB" | pos == "VBD" | pos == "VBG" | pos == "VBN" | pos == "VBP" | pos == "VBZ" ) "verb"
else if(pos == "NN" | pos == "NNP" | pos == "NNS") "noun"
else if(pos == "JJ" | pos == "JJR"| pos == "JJS") "adjective"
else if(pos == "RB" | pos == "RBR") "adverb"
else if(pos == "DT") "determiner"
else if(pos == "PRP" | pos == "PRP$"| pos == "WP" | pos == "WDT" | pos == "WP$" | pos == "WRB" ) "pronoun"
else if (pos == "CC") "conjunction"
else if (pos == "CD") "number"
else if (pos == "FW") "foreign word"
else if (pos == "IN") "preposition"
else if (pos == "MD") "modal"
else pos)
Merge datasets.
all_words = d.clean %>%
gather("category", "word", 6:9) %>%
select(word) %>%
filter(!grepl(" ", word)) %>%
distinct(word) %>%
left_join(pos.collapsed) %>%
left_join(sent) %>%
left_join(lemma)
d.all = d.clean %>%
left_join(all_words, by = c("cue" = "word")) %>%
rename(cue.pos = pos,
cue.polarity = polarity,
cue.subjectivity = subjectivity,
cue.lemma = lemma) %>%
left_join(all_words, by = c("a1" = "word")) %>%
rename(a1.pos = pos,
a1.polarity = polarity,
a1.subjectivity = subjectivity,
a1.lemma = lemma) %>%
left_join(all_words, by = c("a2" = "word")) %>%
rename(a2.pos = pos,
a2.polarity = polarity,
a2.subjectivity = subjectivity,
a2.lemma = lemma) %>%
left_join(all_words, by = c("a3" = "word")) %>%
rename(a3.pos = pos,
a3.polarity = polarity,
a3.subjectivity = subjectivity,
a3.lemma = lemma) %>%
select(userID:education, starts_with('cue'),
starts_with('a1'),
starts_with('a2'),
starts_with('a3'))
Distribution of cue words
d.pos.props = d.all %>%
select(cue.pos, a1.pos, a2.pos,a3.pos) %>%
gather("word_type", "pos", 1:4) %>%
mutate(word_type = as.factor(word_type)) %>%
group_by(word_type, pos) %>%
filter(!is.na(pos)) %>%
summarise(n= n()) %>%
mutate(prop = n / sum(n)) %>%
filter(prop > .005) # filter out low frequency POS
levels(d.pos.props$word_type) = c("a1", "a2", "a3", "cue")
d.pos.props$word_type = factor(d.pos.props$word_type,levels(d.pos.props$word_type)[c(4, 1:3)])
ggplot(d.pos.props, aes(x = reorder(pos,-prop), y = prop, fill = pos)) +
facet_grid(.~word_type) +
geom_bar(stat = "identity") +
ylab("proportion") +
xlab("part of speech") +
ggtitle("Distribution of words") +
ylim(0,1)+
theme_bw(base_size = 15) +
theme(legend.position="none",
axis.text.x = element_text(angle = 90, hjust = 1))
d.pos.props.na = d.all %>%
select(cue.pos, a1.pos, a2.pos,a3.pos) %>%
gather("word_type", "pos", 1:4) %>%
mutate(word_type = as.factor(word_type)) %>%
group_by(word_type, pos) %>%
summarise(n= n()) %>%
mutate(prop = n / sum(n)) %>%
filter(prop > .005) %>% # filter out low frequency POS
filter(is.na(pos))
levels(d.pos.props.na$word_type) = c("a1", "a2", "a3", "cue")
d.pos.props.na$word_type = factor(d.pos.props.na$word_type,levels(d.pos.props.na$word_type)[c(4, 1:3)])
ggplot(d.pos.props.na, aes(x = word_type, y = prop,group = word_type)) +
geom_bar(stat = "identity") +
ylab("proportion") +
xlab("Condition") +
ggtitle("Proportion NA") +
ylim(0,1)+
theme_bw(base_size = 15) +
theme(legend.position="none",
axis.text.x = element_text(angle = 90, hjust = 1))
Most of these untagged words are two word responses, some of which we probably want to include (e.g. “lunch box”), but not sure how to filter those from non-single-item phrases (“not sure” or “pants and trousers”).
pos.counts = d.all %>%
filter(cue.pos == "noun" | cue.pos == "adjective" | cue.pos == "verb") %>%
gather("a.pos", "pos", c(12,17,22)) %>%
select(-cue, -a1, -a2, -a3, -contains("polarity"), -contains("subjectivity"), -contains("lemma")) %>%
filter(pos == "noun" | pos == "adjective" | pos == "verb")
pos.props.all = pos.counts %>%
group_by(pos, userID) %>%
summarise(n = n()) %>%
left_join(d.all %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(education, userID)) %>%
left_join(pos.counts %>% group_by(userID) %>% filter(!is.na(pos)) %>%
summarise(all = n())) %>%
mutate(prop = n/all) %>%
group_by(education, pos) %>%
multi_boot_standard(column = "prop", na.rm = T) %>%
filter(education > 1) %>%
ungroup() %>%
mutate(education = as.factor(education))
ggplot(pos.props.all, aes(x = reorder(pos, -mean), y = mean,
group = education, color = education)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
xlab("association pos") +
ylab("mean prop association pos") +
geom_line() +
geom_point() +
theme_bw(base_size = 15)
pos.counts = d.all %>%
filter(cue.pos == "noun" | cue.pos == "adjective" | cue.pos == "verb") %>%
filter(education > 1) %>%
group_by(cue.pos, education) %>%
summarise(n = n(),
a1_noun = length(which(a1.pos == "noun")),
a1_adjective = length(which(a1.pos == "adjective")),
a1_verb = length(which(a1.pos == "verb")),
a2_noun = length(which(a2.pos == "noun")),
a2_adjective = length(which(a2.pos == "adjective")),
a2_verb = length(which(a2.pos == "verb")),
a3_noun = length(which(a3.pos == "noun")),
a3_adjective = length(which(a3.pos == "adjective")),
a3_verb = length(which(a3.pos == "verb")))
pos.props = pos.counts %>%
gather("var", "value", 4:12) %>%
mutate(prop = value/n) %>%
mutate(a = unlist(lapply(strsplit(var,"_"),function(x) x[1])),
a_pos = unlist(lapply(strsplit(var,"_"),function(x) x[2]))) %>%
select(-var) %>%
mutate(education = as.factor(education))
ggplot(pos.props, aes(x = reorder(a_pos,-prop),
y = prop, group = education, color = education)) +
geom_line(stat= "identity") +
geom_point(stat= "identity") +
facet_grid(cue.pos~a ) +
xlab("association part of speech") +
theme_bw()
No differences of POS by education.
d.pos.age = d.all %>%
filter(age > 15 & age < 75) %>%
mutate(age.bin = cut_width(age, width = 10))
pos.prop.age = d.all %>%
select(userID, age, gender, contains("pos")) %>%
gather("a.pos", "pos", c(5:7)) %>%
filter(pos == "noun" | pos == "adjective" | pos == "verb") %>%
group_by(userID) %>%
summarise(noun = length(which(pos == "noun"))/n(),
adjective = length(which(pos == "adjective"))/n(),
verb = length(which(pos == "verb"))/n()) %>%
left_join(d.pos.age %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(age.bin, userID)) %>%
gather("type", "prop", 2:4) %>%
group_by(age.bin, type) %>%
multi_boot_standard(column = "prop", na.rm = T) %>%
filter(!is.na(age.bin))
ggplot(pos.prop.age, aes(x = age.bin, y = mean, colour = type, group = type)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
geom_line() +
xlab("association pos") +
ylab("age bin") +
geom_line() +
geom_point() +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
No differences of POS by age.
pos.prop.age.cuetype = d.all %>%
select(userID, age, gender, contains("pos")) %>%
gather("a.pos", "pos", c(5:7)) %>%
filter(pos == "noun" | pos == "adjective" | pos == "verb") %>%
filter(cue.pos == "noun" | cue.pos == "adjective" | cue.pos == "verb") %>%
group_by(userID, cue.pos) %>%
summarise(noun = length(which(pos == "noun"))/n(),
adjective = length(which(pos == "adjective"))/n(),
verb = length(which(pos == "verb"))/n()) %>%
left_join(d.pos.age %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(age.bin, userID)) %>%
gather("a.pos", "prop", 3:5) %>%
group_by(age.bin, a.pos, cue.pos) %>%
multi_boot_standard(column = "prop", na.rm = T) %>%
filter(!is.na(age.bin))
ggplot(pos.prop.age.cuetype, aes(x = age.bin, y = mean,
colour = a.pos, group = a.pos)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
facet_grid(.~cue.pos) +
xlab("age bin") +
ylab("Proportion assocations") +
geom_line() +
geom_point() +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
This plot is faceted by the cue POS. So, each point is the probability of giving an associate of a particular POS, given a cue POS, across development.
It looks like there may be an effect of age given a verb cue: Older people are more likely to respond with another verb than younger people (who respond with nouns). A similar story holds for adjectives.
pos.prop.gender = d.all %>%
select(userID, age, gender, contains("pos")) %>%
gather("a.pos", "pos", c(5:7)) %>%
filter(pos == "noun" | pos == "adjective" | pos == "verb") %>%
group_by(userID) %>%
summarise(noun = length(which(pos == "noun"))/n(),
adjective = length(which(pos == "adjective"))/n(),
verb = length(which(pos == "verb"))/n()) %>%
left_join(d.pos.age %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(gender, userID)) %>%
gather("type", "prop", 2:4) %>%
group_by(gender, type) %>%
multi_boot_standard(column = "prop", na.rm = T) %>%
filter(!is.na(gender))
ggplot(pos.prop.gender, aes(x = reorder(type,-mean),
y = mean, group = gender,
color = gender)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
geom_line() +
xlab("association pos") +
ylab("prop associations") +
geom_line() +
geom_point() +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
No effect of gender.
pos.prop.gender.cuetype = d.all %>%
select(userID, age, gender, contains("pos")) %>%
gather("a.pos", "pos", c(5:7)) %>%
filter(pos == "noun" | pos == "adjective" | pos == "verb") %>%
filter(cue.pos == "noun" | cue.pos == "adjective" | cue.pos == "verb") %>%
group_by(userID, cue.pos) %>%
summarise(noun = length(which(pos == "noun"))/n(),
adjective = length(which(pos == "adjective"))/n(),
verb = length(which(pos == "verb"))/n()) %>%
left_join(d.pos.age %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(gender, userID)) %>%
gather("a.pos", "prop", 3:5) %>%
group_by(gender, a.pos, cue.pos) %>%
multi_boot_standard(column = "prop", na.rm = T) %>%
filter(!is.na(gender))
ggplot(pos.prop.gender.cuetype, aes(x = reorder(a.pos, -mean), y = mean,
colour = gender, group = gender)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
facet_grid(.~cue.pos) +
xlab("Gender") +
ylab("Prop. assocations") +
geom_line() +
geom_point() +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Read in concreteness (from Brysbaert et al., 2013).
conc <- read.csv("../data/brysbaert_corpus.csv",header=TRUE) %>%
select(Word, Conc.M)
d.all.conc = d.all %>%
left_join(conc, by = c("cue.lemma" = "Word")) %>%
rename(cue.conc = Conc.M) %>%
left_join(conc, by = c("a1.lemma" = "Word")) %>%
rename(a1.conc = Conc.M) %>%
left_join(conc, by = c("a2.lemma" = "Word")) %>%
rename(a2.conc = Conc.M) %>%
left_join(conc, by = c("a3.lemma" = "Word")) %>%
rename(a3.conc = Conc.M) %>%
select(userID:education, contains(".conc"))
all.conc.means = d.all.conc %>%
gather("word_type", "concreteness", 6:9) %>%
mutate(word_type = as.factor(word_type)) %>%
filter(!is.na(concreteness)) %>%
group_by(userID, word_type) %>%
summarise(mean.concreteness = mean(concreteness)) %>%
group_by(word_type) %>%
multi_boot_standard(column = "mean.concreteness", na.rm = T)
levels(all.conc.means$word_type) = c("a1", "a2", "a3", "cue")
all.conc.means$word_type = factor(all.conc.means$word_type,levels(all.conc.means$word_type)[c(4, 1:3)])
ggplot(all.conc.means, aes(x = word_type, y = mean,
group = 1)) +
geom_line() +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower))+
xlab("Word type") +
ylab("Mean concreteness of word type") +
theme_bw(base_size = 15) +
theme(legend.position = "none")
d.conc.educ = d.all.conc %>%
gather("a.type", "concreteness", 7:9) %>%
filter(!is.na(concreteness)) %>%
group_by(userID) %>%
summarise(mean.concreteness= mean(concreteness)) %>%
left_join(d.all %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(education, userID)) %>%
group_by(education) %>%
filter(education > 1) %>%
filter(!is.na(education)) %>%
multi_boot_standard(column = "mean.concreteness", na.rm = T)
ggplot(d.conc.educ, aes(x = education, y = mean)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
xlab("Education") +
ylab("Mean concreteness of associates") +
geom_line() +
geom_point() +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
d.conc.age = d.all.conc %>%
gather("a.type", "concreteness", 7:9) %>%
filter(!is.na(concreteness)) %>%
group_by(userID) %>%
summarise(mean.concreteness= mean(concreteness)) %>%
left_join(d.pos.age %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(age.bin, userID)) %>%
group_by(age.bin) %>%
filter(!is.na(age.bin))%>%
multi_boot_standard(column = "mean.concreteness", na.rm = T)
ggplot(d.conc.age, aes(x = age.bin, y = mean)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
xlab("Age bin") +
ylab("Mean concretness of associates") +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
d.conc.age.atype = d.all.conc %>%
gather("a.type", "concreteness", 7:9) %>%
filter(!is.na(concreteness)) %>%
group_by(userID, a.type) %>%
summarise(mean.concreteness= mean(concreteness)) %>%
left_join(d.pos.age %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(age.bin, userID)) %>%
group_by(age.bin, a.type) %>%
filter(!is.na(age.bin))%>%
multi_boot_standard(column = "mean.concreteness", na.rm = T)
ggplot(d.conc.age.atype, aes(x = age.bin, y = mean,
group = a.type, color = a.type)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
geom_line() +
xlab("Age bin") +
ylab("Mean concretness of associates") +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
d.conc.gender = d.all.conc %>%
gather("a.type", "concreteness", 7:9) %>%
filter(!is.na(concreteness)) %>%
group_by(userID) %>%
summarise(mean.concreteness= mean(concreteness)) %>%
left_join(d.all %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(gender, userID)) %>%
group_by(gender) %>%
filter(!is.na(gender)) %>%
multi_boot_standard(column = "mean.concreteness", na.rm = T)
ggplot(d.conc.gender , aes(x = gender, y = mean)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
xlab("Gender") +
ylab("Mean concreteness of associates") +
geom_line() +
geom_point() +
theme_bw(base_size = 15)
The textblob toolkit has both polarity and subjectivity measures, so I just threw them in, but there’s a lot of missing data for both measures.
all.pol.means = d.all %>%
select(userID, education, contains("polarity")) %>%
gather("word_type", "polarity", 2:5) %>%
mutate(word_type = as.factor(word_type)) %>%
filter(!is.na(polarity)) %>%
filter(polarity != 0) %>%
group_by(userID, word_type) %>%
summarise(mean.polarity= mean(polarity)) %>%
group_by(word_type) %>%
multi_boot_standard(column = "mean.polarity", na.rm = T)
levels(all.pol.means$word_type) = c("a1", "a2", "a3", "cue")
all.pol.means$word_type = factor(all.pol.means$word_type,levels(all.pol.means$word_type)[c(4, 1:3)])
ggplot(all.pol.means, aes(x = word_type, y = mean,
group = 1)) +
geom_line() +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower))+
xlab("Word type") +
ylab("Mean polarity of word type") +
theme_bw(base_size = 15) +
theme(legend.position = "none")
d.polarity.educ = d.all %>%
select(userID, education, contains("polarity"), -cue.polarity) %>%
gather("a.type", "polarity", 3:5) %>%
filter(!is.na(polarity)) %>%
filter(polarity != 0) %>%
group_by(userID) %>%
summarise(mean.polarity = mean(polarity)) %>%
left_join(d.all %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(education, userID)) %>%
group_by(education) %>%
filter(education > 1) %>%
filter(!is.na(education)) %>%
multi_boot_standard(column = "mean.polarity", na.rm = T)
ggplot(d.polarity.educ, aes(x = education, y = mean)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
xlab("Education") +
ylab("Mean polarity of associates") +
geom_line() +
geom_point() +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
d.polarity.age = d.all %>%
select(userID, education, contains("polarity"), -cue.polarity) %>%
gather("a.type", "polarity", 3:5) %>%
filter(!is.na(polarity)) %>%
filter(polarity != 0) %>%
group_by(userID) %>%
summarise(mean.polarity = mean(polarity)) %>%
left_join(d.pos.age %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(age.bin, userID)) %>%
group_by(age.bin) %>%
filter(!is.na(age.bin)) %>%
multi_boot_standard(column = "mean.polarity", na.rm = T)
ggplot(d.polarity.age, aes(x = age.bin, y = mean)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
xlab("Age bin") +
ylab("Mean polarity of associates") +
geom_line() +
geom_point() +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
Older folks are happier?
d.polarity.gender = d.all %>%
select(userID, education, contains("polarity"), -cue.polarity) %>%
gather("a.type", "polarity", 3:5) %>%
filter(!is.na(polarity)) %>%
filter(polarity != 0) %>%
group_by(userID) %>%
summarise(mean.polarity = mean(polarity)) %>%
left_join(d.all %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(gender, userID)) %>%
group_by(gender) %>%
filter(!is.na(gender)) %>%
multi_boot_standard(column = "mean.polarity", na.rm = T)
ggplot(d.polarity.gender, aes(x = gender, y = mean)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
xlab("Education") +
ylab("Mean polarity of associates") +
geom_line() +
geom_point() +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
all.subj.means = d.all %>%
select(userID, education, contains("subjectivity")) %>%
gather("word_type", "subjectivity", 2:5)%>%
mutate(word_type = as.factor(word_type)) %>%
filter(!is.na(subjectivity)) %>%
filter(subjectivity != 0) %>%
group_by(userID, word_type) %>%
summarise(mean.subjectivity = mean(subjectivity)) %>%
group_by(word_type) %>%
multi_boot_standard(column = "mean.subjectivity", na.rm = T)
levels(all.subj.means$word_type) = c("a1", "a2", "a3", "cue")
all.subj.means$word_type = factor(all.subj.means$word_type,levels(all.subj.means$word_type)[c(4, 1:3)])
ggplot(all.subj.means, aes(x = word_type, y = mean,
group = 1)) +
geom_line() +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower))+
xlab("Word Type") +
ylab("Mean subjectivity of word type") +
theme_bw(base_size = 15) +
theme(legend.position = "none")
d.subj.educ = d.all %>%
select(userID, education, contains("subjectivity"), -cue.subjectivity) %>%
gather("a.type", "subjectivity", 3:5) %>%
filter(!is.na(subjectivity)) %>%
filter(subjectivity != 0) %>%
group_by(userID) %>%
summarise(mean.subjectivity = mean(subjectivity)) %>%
left_join(d.all %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(education, userID)) %>%
group_by(education) %>%
filter(education > 1) %>%
filter(!is.na(education)) %>%
multi_boot_standard(column = "mean.subjectivity", na.rm = T)
ggplot(d.subj.educ, aes(x = education, y = mean)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
xlab("Education") +
ylab("Mean subjectivity of associates") +
geom_line() +
geom_point() +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
d.subjectivity.age = d.all %>%
select(userID, education, contains("subjectivity"), -cue.subjectivity) %>%
gather("a.type", "subjectivity", 3:5) %>%
filter(!is.na(subjectivity)) %>%
filter(subjectivity != 0) %>%
group_by(userID) %>%
summarise(mean.subjectivity = mean(subjectivity)) %>%
left_join(d.pos.age %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(age.bin, userID)) %>%
group_by(age.bin) %>%
filter(!is.na(age.bin))%>%
multi_boot_standard(column = "mean.subjectivity", na.rm = T)
ggplot(d.subjectivity.age, aes(x = age.bin, y = mean)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
xlab("Age bin") +
ylab("Mean subjectivity of associates") +
theme_bw(base_size = 15) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
d.subjectivity.gender = d.all %>%
select(userID, gender, contains("subjectivity"), -cue.subjectivity) %>%
gather("a.type", "subjectivity", 3:5) %>%
filter(!is.na(subjectivity)) %>%
filter(subjectivity != 0) %>%
group_by(userID) %>%
summarise(mean.subjectivity = mean(subjectivity)) %>%
left_join(d.all %>% group_by(userID) %>% slice(1)
%>% ungroup() %>% select(gender, userID)) %>%
group_by(gender) %>%
filter(!is.na(gender)) %>%
multi_boot_standard(column = "mean.subjectivity", na.rm = T)
ggplot(d.subjectivity.gender , aes(x = gender, y = mean)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
xlab("Gender") +
ylab("Mean subjectivity of associates") +
geom_line() +
geom_point() +
theme_bw(base_size = 15)