Do the statistics of cue-associate bigrams differ across languages?
Read in data and munge
d.raw = read.csv("../../data/associations_ppdetails_en_05_01_2015.csv")
lang.codes = read.csv("../../data/language_codes.csv") %>%
select(ISO639.2BCode, LanguageName)
d.long = d.raw %>%
gather("association", "word", 7:9) %>%
mutate(word = gsub("\\bx\\b", "NA", word)) %>% # remove missing words
spread("association", "word") %>%
rename(a1 = asso1Clean,
a2 = asso2Clean,
a3 = asso3Clean)
d.clean = d.long %>%
left_join(lang.codes, by = c("nativeLanguage" = "ISO639.2BCode")) %>%
filter(nativeLanguage != "eng" & nativeLanguage != "" & nativeLanguage != "99" &
nativeLanguage != "fla" & nativeLanguage != "can" & nativeLanguage != "nan" &
nativeLanguage != "pun" & nativeLanguage != "nl") %>%
mutate(LanguageName = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), "English",
as.character(LanguageName)),
LanguageName = as.factor(LanguageName),
country = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), nativeLanguage, NA),
country = as.factor(country),
native.lang = ifelse(LanguageName == "English", "english", "other"),
native.lang = as.factor(native.lang)) %>%
select(-nativeLanguage)
# Get second languages and num participants, with greater than N_CUTOFF participants
N_CUTOFF <- 400
lang.dems = d.clean %>%
group_by(userID, LanguageName, native.lang) %>%
slice(1) %>%
group_by(LanguageName, native.lang) %>%
summarise(n = n()) %>%
filter(native.lang == "other") %>%
arrange(-n) %>%
filter(n > N_CUTOFF) %>%
select(-native.lang) %>%
ungroup()
collapse across associates
d.all.ca = d.clean %>%
filter(LanguageName %in% lang.dems$LanguageName | LanguageName == "English") %>%
ungroup() %>%
gather("associate.type", "associate", 6:8) %>%
filter(cue != "NA" & associate != "NA" ) %>%
mutate(bigram = paste(cue, associate))
bigram.counts = d.all.ca %>%
count(native.lang, cue, associate, bigram)
d.all.ca %>%
select(userID, LanguageName, native.lang) %>%
distinct() %>%
group_by(LanguageName,native.lang ) %>%
summarize(n=n()) %>%
kable()
| LanguageName | native.lang | n |
|---|---|---|
| Dutch | other | 687 |
| English | english | 62412 |
| Finnish | other | 459 |
| French | other | 601 |
| German | other | 1176 |
| Italian | other | 411 |
| Spanish | other | 1071 |
From Manning and Schuetze (1999; pg. 166): Discover co-occurences that best-distinguish between words within the same corpus. This is slightly different that our goal, because here we want to compare across corpora, and thus our N’s are different. Here, I modify their formula to account for this (analagous to Welch’s two-sample t-test).
Manning and Schuetze: \[ t = \frac{\frac{C(v_1w)}{N}-\frac{C(v_2w)}{N}} {\sqrt{\frac{C(v_1w) + C(v_2w)}{N^2}}} \]
Welch’s version: \[ t = \frac{\frac{C(v_1w)}{n_1}-\frac{C(v_2w)}{n_2}} {\sqrt{\frac{C(v_1w)}{n_1} + \frac{C(v_2w)}{n_2}}} \]
Question: Are N bigram counts or word counts? Assume they are bigrams.
Compare native to non-native:
total.bigrams.native.lang = bigram.counts %>%
group_by(native.lang) %>%
summarize(total.bigrams = sum(n))
bigram.counts.rf = bigram.counts %>%
left_join(total.bigrams.native.lang) %>%
mutate(rf = n/total.bigrams)
t.scores <- bigram.counts.rf %>%
ungroup() %>%
as_tibble() %>%
select(native.lang, rf, cue, associate) %>%
spread(native.lang, rf) %>%
filter(!is.na(english), !is.na(other)) %>%
mutate(t = (english - other)/sqrt(english + other))
t-scores
ggplot(t.scores, aes(x = t)) +
geom_histogram() +
theme_bw() +
ggtitle("distribution of t-scores")
Large t.scores
t.scores %>%
arrange(-t) %>%
select(cue, associate, t) %>%
slice (1:100) %>%
kable()
| cue | associate | t |
|---|---|---|
| begin | start | 0.0046245 |
| mozzarella | cheese | 0.0046245 |
| cob | corn | 0.0045862 |
| comprehend | understand | 0.0044698 |
| exhausted | tired | 0.0044698 |
| branch | tree | 0.0044305 |
| smooch | kiss | 0.0044305 |
| espresso | coffee | 0.0043107 |
| kitty | cat | 0.0043107 |
| bashful | shy | 0.0042703 |
| yolk | egg | 0.0042703 |
| heifer | cow | 0.0042295 |
| assist | help | 0.0041884 |
| freezing | cold | 0.0041884 |
| ill | sick | 0.0041884 |
| phobia | fear | 0.0041884 |
| chubby | fat | 0.0041470 |
| quick | fast | 0.0041052 |
| chilly | cold | 0.0040632 |
| flamingo | pink | 0.0040632 |
| actual | real | 0.0040208 |
| polar | bear | 0.0040208 |
| curl | hair | 0.0039781 |
| film | movie | 0.0039781 |
| sterling | silver | 0.0039781 |
| wager | bet | 0.0039781 |
| grin | smile | 0.0039350 |
| parchment | paper | 0.0039350 |
| weary | tired | 0.0039350 |
| blackboard | chalk | 0.0038916 |
| cuckoo | clock | 0.0038916 |
| present | gift | 0.0038916 |
| scheme | plan | 0.0038916 |
| celebration | party | 0.0038478 |
| crossword | puzzle | 0.0038478 |
| cub | bear | 0.0038478 |
| however | but | 0.0038478 |
| query | question | 0.0038478 |
| consume | eat | 0.0038036 |
| finances | money | 0.0038036 |
| intoxication | drunk | 0.0038036 |
| stone | rock | 0.0038036 |
| canary | bird | 0.0037591 |
| chapel | church | 0.0037591 |
| congregation | church | 0.0037591 |
| idiotic | stupid | 0.0037591 |
| instruct | teach | 0.0037591 |
| comrade | friend | 0.0037141 |
| cranny | nook | 0.0037141 |
| found | lost | 0.0037141 |
| frequently | often | 0.0037141 |
| rifle | gun | 0.0037141 |
| row | boat | 0.0037141 |
| smiling | happy | 0.0037141 |
| acquaintance | friend | 0.0036688 |
| aged | old | 0.0036688 |
| off | on | 0.0036688 |
| aroma | smell | 0.0036230 |
| bestow | give | 0.0036230 |
| bullet | gun | 0.0036230 |
| hello | hi | 0.0036230 |
| heron | bird | 0.0036230 |
| obsessive | compulsive | 0.0036230 |
| seek | find | 0.0036230 |
| station | train | 0.0036230 |
| thank | you | 0.0036230 |
| unkind | mean | 0.0036230 |
| anchovy | fish | 0.0035769 |
| bumper | car | 0.0035769 |
| corpse | dead | 0.0035769 |
| dagger | knife | 0.0035769 |
| dingo | dog | 0.0035769 |
| ebony | black | 0.0035769 |
| gigantic | huge | 0.0035769 |
| handbag | purse | 0.0035769 |
| holler | yell | 0.0035769 |
| pupil | student | 0.0035769 |
| start | begin | 0.0035769 |
| stump | tree | 0.0035769 |
| briefs | underwear | 0.0035303 |
| certain | sure | 0.0035303 |
| converse | talk | 0.0035303 |
| frightening | scary | 0.0035303 |
| plasma | blood | 0.0035303 |
| portly | fat | 0.0035303 |
| reef | coral | 0.0035303 |
| woodpecker | bird | 0.0035303 |
| amphibian | frog | 0.0034833 |
| cuisine | food | 0.0034833 |
| federal | government | 0.0034833 |
| height | tall | 0.0034833 |
| hoot | owl | 0.0034833 |
| iris | flower | 0.0034833 |
| jingle | bells | 0.0034833 |
| licorice | black | 0.0034833 |
| paste | glue | 0.0034833 |
| rattle | baby | 0.0034833 |
| request | ask | 0.0034833 |
| scissors | cut | 0.0034833 |
| wellness | health | 0.0034833 |
Now let’s see if we can predict the t-score based on characteristics of the cue.
Join t.scores and cues characteristics
cues.chars = read.csv ("data/cues_chars.csv")
t.scores.full = t.scores %>%
left_join(cues.chars) %>%
select(-english, -other)
Write df
mean.t.scores <- t.scores %>%
select(cue, t) %>%
distinct() %>%
group_by(cue) %>%
summarize(t = mean(t))
#write.csv(mean.t.scores, "data/t.scors.by.cue.filtered.csv")
Does frequency of cue predict mean t for a cue?
freq.ts = t.scores.full %>%
select(cue, associate, t, Lg10WF) %>%
distinct() %>%
group_by(cue) %>%
summarize(t = mean(t), # mean t for each cue
Lg10WF = Lg10WF[1])
ggplot(freq.ts, aes(x = Lg10WF, y = t)) +
geom_smooth(method = "lm") +
theme_bw()
freq.ts %>%
do(tidy(cor.test(.$t, .$Lg10WF))) %>%
select(estimate, statistic, p.value) %>%
kable()
| estimate | statistic | p.value |
|---|---|---|
| 0.0825235 | 7.897885 | 0 |
Higher frequency cues have less divergence.
Does sentiment of cue predict mean t for a cue?
quantsent.ts = t.scores.full %>%
select(cue, associate, t, quant.sent) %>%
distinct() %>%
group_by(cue) %>%
summarize(t = mean(t), # mean t for each cue
quant.sent = quant.sent[1])
ggplot(quantsent.ts, aes(x = quant.sent, y = t)) +
geom_smooth(method = "lm") +
theme_bw()
quantsent.ts %>%
do(tidy(cor.test(.$t, .$quant.sent))) %>%
select(estimate, statistic, p.value) %>%
kable()
| estimate | statistic | p.value |
|---|---|---|
| 0.0253056 | 0.8245423 | 0.4098168 |
No relation between sentiment of cue and t.
Does sentiment of cue predict mean t for a cue?
qual.sent.ts = t.scores.full %>%
select(cue, associate, t, qual.sent) %>%
distinct() %>%
filter(!is.na(qual.sent)) %>%
group_by(cue,qual.sent) %>%
summarize(t = mean(t)) %>% # mean t for each cue %>%
group_by(qual.sent) %>%
multi_boot_standard(column = "t")
ggplot(qual.sent.ts, aes(fill = qual.sent, y = mean, x = reorder(qual.sent, mean))) +
xlab("sentiment") +
ylab("t") +
geom_bar(stat = "identity", position = "dodge") +
geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
position = position_dodge(width = .9)) +
theme_bw() +
theme(legend.position="none",
axis.text.x = element_text(angle = 90, hjust = 1))
Not much going on here, small effect of negative – more difference for negative words.
Does pos of cue predict mean t for a cue?
pos.ts = t.scores.full %>%
select(cue, associate, t, pos) %>%
distinct() %>%
filter(!is.na(pos)) %>%
group_by(cue,pos) %>%
summarize(t = mean(t)) %>% # mean t for each cue
group_by(pos) %>%
multi_boot_standard(column = "t")
ggplot(pos.ts, aes(fill = pos, y = mean, x = reorder(pos, mean))) +
xlab("pos") +
ylab("t") +
geom_bar(stat = "identity", position = "dodge") +
geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
position = position_dodge(width = .9)) +
theme_bw() +
theme(legend.position="none",
axis.text.x = element_text(angle = 90, hjust = 1))
Less variability for function words.
Does concreteness of cue predict mean t for a cue?
conc.ts = t.scores.full %>%
select(cue, associate, t, Conc.M) %>%
distinct() %>%
group_by(cue) %>%
summarize(t = mean(t), # mean t for each cue
conc = Conc.M[1])
ggplot(conc.ts, aes(x = conc, y = t)) +
geom_smooth(method = "lm") +
theme_bw()
conc.ts %>%
do(tidy(cor.test(.$t, .$conc))) %>%
select(estimate, statistic, p.value) %>%
kable()
| estimate | statistic | p.value |
|---|---|---|
| 0.0607458 | 5.772228 | 0 |
Small effect of concreteness in the predicted direction – Less difference for cues that are more concrete.