read in data
d.raw = read.csv("../../data/associations_ppdetails_en_05_01_2015.csv")
lang.codes = read.csv("../../data/language_codes.csv") %>%
select(ISO639.2BCode, LanguageName)
d.long = d.raw %>%
gather("association", "word", 7:9) %>%
mutate(word = gsub("\\bx\\b", "NA", word)) %>% # remove missing words
spread("association", "word") %>%
rename(a1 = asso1Clean,
a2 = asso2Clean,
a3 = asso3Clean)
d.clean = d.long %>%
left_join(lang.codes, by = c("nativeLanguage" = "ISO639.2BCode")) %>%
filter(nativeLanguage != "eng" & nativeLanguage != "" & nativeLanguage != "99" &
nativeLanguage != "fla" & nativeLanguage != "can" & nativeLanguage != "nan" &
nativeLanguage != "pun" & nativeLanguage != "nl") %>%
mutate(LanguageName = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), "English",
as.character(LanguageName)),
LanguageName = as.factor(LanguageName),
country = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), nativeLanguage, NA),
country = as.factor(country),
native.lang = ifelse(LanguageName == "English", "english", "other"),
native.lang = as.factor(native.lang)) %>%
select(-nativeLanguage)
collapse across associates
d.all.ca = d.clean %>%
ungroup() %>%
gather("associate.type", "associate", 6:8) %>%
filter(cue != "NA" & associate != "NA" ) %>%
mutate(bigram = paste(cue, associate))
d.all.ca %>%
select(userID, native.lang) %>%
distinct() %>%
group_by(native.lang ) %>%
summarize(n=n()) %>%
kable()
| native.lang | n |
|---|---|
| english | 62412 |
| other | 9369 |
from: Manning and Schuetze (1999; pg. 171)
For each cue compare the distribtion over associates for native and non-native speakers.
x2.scores <- d.all.ca %>%
group_by(native.lang,cue,associate) %>%
summarize(n = n()) %>%
spread(native.lang, n) %>%
filter(!is.na(english) & !is.na(other)) %>%
group_by(cue) %>%
do(x2 = chisq.test(rbind(.$english,.$other))$statistic,
p = chisq.test(rbind(.$english,.$other))$p.value) %>%
mutate(x2 = unlist(x2),
p = unlist(p))
x2 scores
ggplot(x2.scores, aes(x = x2)) +
geom_histogram() +
theme_bw() +
ggtitle("distribution of t-scores")
Top x2 scores.
x2.scores %>%
arrange(-x2) %>%
select(cue) %>%
slice (1:100) %>%
as.list(.)
## $cue
## [1] spud utensil lavatory reindeer belch
## [6] tablespoon bolts jaguar rural humid
## [11] kid guidance hemisphere celebration drowsy
## [16] mourn autumn scratch can procession
## [21] warrant trip tea pancakes conclude
## [26] leopard pavement doe college honeydew
## [31] scared over projectile yes Superbowl
## [36] would receptionist sunrise dromedary hunt
## [41] mama saucer leather can't terrible
## [46] calf regulations naughty cents mixer
## [51] searching possess single concept fever
## [56] plantation handbag beanie stench tango
## [61] Portland washcloth desire crook zoom
## [66] comma rubber excel chlorine household
## [71] zip hangover physics Washington woodpecker
## [76] cushion frown fearful flexible summit
## [81] unsolved prom swipe brief theft
## [86] final envious cafeteria plates park
## [91] legitimate zucchini viola hamster symphony
## [96] harvest gingerbread disciple average beg
## 10050 Levels: a a few a little a lot aardvark abacus abandon ... zucchini
Many of these seem culture-y.
Now let’s see if we can predict the x2-score based on characteristics of the cue.
Join x2.scores and cues characteristics
cues.chars = read.csv ("data/cues_chars.csv")
x2.scores.full = x2.scores %>%
left_join(cues.chars)
Does frequency of cue predict mean x2 for a cue?
freq.x2s = x2.scores.full %>%
select(cue, x2, Lg10WF) %>%
distinct() %>%
filter(!is.na(Lg10WF))
ggplot(freq.x2s, aes(x = Lg10WF, y = x2)) +
geom_smooth(method = "lm") +
theme_bw()
freq.x2s %>%
ungroup() %>%
do(tidy(cor.test(.$x2, .$Lg10WF))) %>%
select(estimate, statistic, p.value) %>%
kable()
| estimate | statistic | p.value |
|---|---|---|
| 0.0243199 | 2.335397 | 0.019544 |
Higher frequency words have more divergence.
Does sentiment of cue predict mean t for a cue? Sentiments from: Finn Arup Nielse
quantsent.x2s = x2.scores.full %>%
select(cue, x2, quant.sent) %>%
distinct()
ggplot(quantsent.x2s, aes(x = quant.sent, y = x2)) +
geom_smooth(method = "lm") +
theme_bw()
quantsent.x2s %>%
ungroup() %>%
do(tidy(cor.test(.$x2, .$quant.sent))) %>%
select(estimate, statistic, p.value) %>%
kable()
| estimate | statistic | p.value |
|---|---|---|
| -0.0185194 | -0.6064539 | 0.5443418 |
| No effect of | sentiment |
Sentiments from NRC Emotion Lexicon from Saif Mohammad and Peter Turney (n ~7000, but some have more than one category)
qual.sent.x2s =x2.scores.full %>%
select(cue, x2, qual.sent) %>%
distinct() %>%
filter(!is.na(qual.sent)) %>%
group_by(qual.sent) %>%
multi_boot_standard(column = "x2")
ggplot(qual.sent.x2s, aes(fill = qual.sent, y = mean, x = reorder(qual.sent, mean))) +
xlab("sentiment") +
ylab("x2") +
geom_bar(stat = "identity", position = "dodge") +
geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
position = position_dodge(width = .9)) +
theme_bw() +
theme(legend.position="none",
axis.text.x = element_text(angle = 90, hjust = 1))
Not much going on here.
Does pos of cue predict mean x2 for a cue?
pos.x2s = x2.scores.full %>%
select(cue, x2, pos) %>%
distinct() %>%
filter(!is.na(pos)) %>%
group_by(pos) %>%
multi_boot_standard(column = "x2")
ggplot(pos.x2s, aes(fill = pos, y = mean, x = reorder(pos, mean))) +
xlab("pos") +
ylab("x2") +
geom_bar(stat = "identity", position = "dodge") +
geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
position = position_dodge(width = .9)) +
theme_bw() +
theme(legend.position="none",
axis.text.x = element_text(angle = 90, hjust = 1))
Note much going on with part of speech.
Does concreteness of cue predict x2 for a cue?
conc.x2s = x2.scores.full %>%
select(cue, x2, Conc.M) %>%
distinct()
ggplot(conc.x2s, aes(x = Conc.M, y = x2)) +
geom_point() +
geom_smooth(method = "lm") +
theme_bw()
conc.x2s %>%
ungroup() %>%
do(tidy(cor.test(.$x2, .$Conc.M))) %>%
select(estimate, statistic, p.value) %>%
kable()
| estimate | statistic | p.value |
|---|---|---|
| 0.1113864 | 10.69506 | 0 |
More concreteness, more divergence.
t.scores = read.csv("data/t.scors.by.cue.csv") %>%
select(-1)
x2.scores = x2.scores %>%
left_join(t.scores)
ggplot(x2.scores, aes(x = abs(t), y = x2)) +
geom_point() +
geom_smooth(method = "lm") +
theme_bw()
x2.scores %>%
ungroup() %>%
do(tidy(cor.test(.$x2, abs(.$t)))) %>%
select(estimate, statistic, p.value) %>%
kable()
| estimate | statistic | p.value |
|---|---|---|
| -0.0232808 | -2.333947 | 0.0196181 |
t and x2 are weakly negatively correlated, which is puzzling.