read in data
# Load the raw association responses (with participant details) and the
# language-code lookup table, keeping only the code and language-name columns.
d.raw <- read.csv("../../data/associations_ppdetails_en_05_01_2015.csv")
lang.codes <- select(
  read.csv("../../data/language_codes.csv"),
  ISO639.2BCode, LanguageName
)
# Recode missing responses in the three association columns.
#
# Columns 7:9 of d.raw (renamed to a1-a3 at the end) are stacked into long
# format, any response that consists solely of "x" is replaced by the string
# "NA", and the data are spread back to one column per association.
# NOTE(review): "x" is presumably the dataset's marker for a missing
# response - confirm against the data documentation.
d.long <- d.raw %>%
  gather("association", "word", 7:9) %>%
  # Anchor the pattern to the whole response. A word-boundary match
  # ("\\bx\\b") also rewrites genuine multi-word responses, e.g. "x ray"
  # would become "NA ray". The placeholder deliberately stays the *string*
  # "NA" (not a real NA) because later steps filter with `!= "NA"`.
  mutate(word = gsub("^x$", "NA", word)) %>%
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean)
# Attach language names, drop the excluded native-language codes, and derive
# the participant-level factors LanguageName, country and native.lang.
d.clean <- d.long %>%
  left_join(lang.codes, by = c("nativeLanguage" = "ISO639.2BCode")) %>%
  # Rows with a missing code are dropped as well, exactly as a chain of
  # `!=` comparisons would drop them.
  filter(
    !is.na(nativeLanguage),
    !(nativeLanguage %in%
        c("eng", "", "99", "fla", "can", "nan", "pun", "nl"))
  ) %>%
  mutate(
    # An all-uppercase code is labelled "English" and the code itself is
    # stored as the country.
    LanguageName = as.factor(
      ifelse(grepl("^[[:upper:]]+$", nativeLanguage),
             "English",
             as.character(LanguageName))
    ),
    country = as.factor(
      ifelse(grepl("^[[:upper:]]+$", nativeLanguage), nativeLanguage, NA)
    ),
    native.lang = as.factor(
      ifelse(LanguageName == "English", "english", "other")
    )
  ) %>%
  select(-nativeLanguage)
# Non-English native languages with more than N_CUTOFF distinct participants,
# together with the participant count per language (largest first).
N_CUTOFF <- 700
lang.dems <- d.clean %>%
  # one row per participant
  distinct(userID, LanguageName, native.lang) %>%
  group_by(LanguageName, native.lang) %>%
  summarise(n = n()) %>%
  filter(native.lang == "other", n > N_CUTOFF) %>%
  arrange(desc(n)) %>%
  select(-native.lang) %>%
  ungroup()
collapse across associates
# Long format with one row per cue-associate response, restricted to English
# and the languages retained in lang.dems; rows whose cue or associate is the
# "NA" placeholder string are removed and a "cue associate" bigram is added.
d.all.ca <- d.clean %>%
  filter(
    LanguageName == "English" | LanguageName %in% lang.dems$LanguageName
  ) %>%
  ungroup() %>%
  # NOTE(review): positions 6:8 are presumably the a1/a2/a3 columns - confirm
  gather("associate.type", "associate", 6:8) %>%
  filter(cue != "NA", associate != "NA") %>%
  mutate(bigram = paste(cue, associate))
# Number of distinct participants per language group, largest group first.
d.all.ca %>%
  distinct(userID, LanguageName, native.lang) %>%
  group_by(LanguageName, native.lang) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  kable()
| LanguageName | native.lang | n |
|---|---|---|
| English | english | 62412 |
| German | other | 1176 |
| Spanish | other | 1071 |
from: Manning and Schuetze (1999; pg. 171)
For each cue, compare the distribution over associates for native and non-native speakers.
# Per-cue chi-square comparison of the associate distributions of two
# language groups.
#
# Args:
#   data:   long-format responses with columns LanguageName, cue, associate.
#   lang.a: first LanguageName value to compare.
#   lang.b: second LanguageName value to compare.
#   label:  value stored in the `languages` column of the result.
#
# Returns:
#   An ungrouped tibble with one row per cue and the columns cue, x2
#   (chi-square statistic), p (p-value) and languages.
#
# Only associates produced by both groups enter the test. Cues with fewer
# than two shared associates are skipped: for a table with a single column,
# chisq.test() silently switches to a goodness-of-fit test against equal
# probabilities, which would measure the difference in group sizes rather
# than a difference in associate distributions.
cue.x2.scores <- function(data, lang.a, lang.b, label) {
  counts <- data %>%
    as_tibble() %>%
    filter(LanguageName == lang.a | LanguageName == lang.b) %>%
    group_by(LanguageName, cue, associate) %>%
    summarize(n = n()) %>%
    ungroup() %>%
    spread(LanguageName, n)

  # Fail early with a clear message if one of the groups has no responses.
  stopifnot(all(c(lang.a, lang.b) %in% names(counts)))

  # Keep only associates observed in both groups, then give the two count
  # columns fixed names so the per-cue step does not depend on the languages.
  shared <- !is.na(counts[[lang.a]]) & !is.na(counts[[lang.b]])
  counts <- counts[shared, ]
  names(counts)[match(c(lang.a, lang.b), names(counts))] <-
    c("count.a", "count.b")

  counts %>%
    group_by(cue) %>%
    filter(n() >= 2) %>%
    do({
      # Run the test once per cue and read both values from the same fit.
      fit <- chisq.test(rbind(.$count.a, .$count.b))
      data.frame(x2 = unname(fit$statistic), p = fit$p.value)
    }) %>%
    ungroup() %>%
    mutate(languages = label)
}

x2.scores.EG <- cue.x2.scores(d.all.ca, "English", "German", "EG")
x2.scores.ES <- cue.x2.scores(d.all.ca, "English", "Spanish", "ES")
x2.scores.SG <- cue.x2.scores(d.all.ca, "Spanish", "German", "SG")

# Stack the three pairwise comparisons (same row order as before: SG, ES, EG).
x2.scores <- rbind(x2.scores.SG, x2.scores.ES, x2.scores.EG)
x2 scores
# Histogram of the per-cue x2 statistics, one panel per language pair.
ggplot(data = x2.scores, mapping = aes(x = x2)) +
  geom_histogram() +
  facet_wrap(~languages) +
  ggtitle("distribution of x2-scores") +
  theme_bw()
Top x2 scores for English-Spanish
# The 100 cues with the largest x2 statistic in the English-Spanish comparison.
x2.scores %>%
  filter(languages == "ES") %>%
  arrange(desc(x2)) %>%
  select(cue) %>%
  slice(1:100) %>%
  as.list()
## $cue
## [1] cob perhaps saddle educator jigsaw
## [6] assist woof revolver polar secure
## [11] transform fir merry willow feline
## [16] forecast grass skim equilibrium crescent
## [21] smiling loaf banks flood paycheck
## [26] probable remain autumn bluejay boy
## [31] HIV koala man cuisine dad
## [36] integer vacant alloy boil library
## [41] foreplay nog puck flipper boulevard
## [46] bro feral investment numerous phony
## [51] erupt almighty cafe examination intellectual
## [56] carton invitation spicy spoon sugar
## [61] tunes woods budgie shoot swatter
## [66] uncommon bumpy cedar reflect refrain
## [71] solo scramble debate almonds improve
## [76] jogging onward rhythm shut yourself
## [81] dialect frequent frugal gooey hard
## [86] perch prod veranda ponytail dressing
## [91] ouch salutation dog ripe come back
## [96] haven speck rhythmic cost orange
## 10050 Levels: a a few a little a lot aardvark abacus abandon ... zucchini
Top x2 scores for English-German
# The 100 cues with the largest x2 statistic in the English-German comparison.
x2.scores %>%
  filter(languages == "EG") %>%
  arrange(desc(x2)) %>%
  select(cue) %>%
  slice(1:100) %>%
  as.list()
## $cue
## [1] spud hi either not good quench
## [6] bumble glucose leash actual scheme
## [11] boulder calf lobe braces loaf
## [16] ancient keg authentic wreak beacon
## [21] detest paycheck trout estimate migraine
## [26] Pluto station cops lawn strike
## [31] stump hare cuckoo quail boa
## [36] dad duration we confidential pony
## [41] up isolated mustang nada annual
## [46] upbeat doze blot whipped chat
## [51] saber salutations assets cardboard communicate
## [56] sooner disappointed queer anxious astonish
## [61] counterfeit transition exact miner sesame
## [66] there tumble big immaculate squander
## [71] bitch constrictor moat prohibition crossbow
## [76] drove campsite courteous miniscule pirate
## [81] vulture groan drop everybody lake
## [86] ouch brevity bungalow hyphen dialect
## [91] acquire crow gorilla wicked chuck
## [96] exposed fearless ideal smack triad
## 10050 Levels: a a few a little a lot aardvark abacus abandon ... zucchini
# Overlap between the most divergent cues of the English-Spanish and the
# English-German comparisons.
#
# N_TOP_CUES is the number of highest-x2 cues taken from each comparison; it
# replaces a literal that was repeated in both selections and the final ratio.
N_TOP_CUES <- 3000

ES.top <- x2.scores %>%
  ungroup() %>%  # drop any grouping so the ordering is global
  filter(languages == "ES") %>%
  arrange(desc(x2)) %>%
  select(cue) %>%
  head(N_TOP_CUES)

EG.top <- x2.scores %>%
  ungroup() %>%
  filter(languages == "EG") %>%
  arrange(desc(x2)) %>%
  select(cue) %>%
  head(N_TOP_CUES)

# Divide by the number of cues actually compared, so the proportion stays
# meaningful when a comparison has fewer than N_TOP_CUES cues.
n.compared <- min(N_TOP_CUES, nrow(ES.top), nrow(EG.top))
length(intersect(as.character(ES.top$cue), as.character(EG.top$cue))) /
  n.compared
## [1] 0.343
Of the top 3000 most different cues for each comparison, there’s only 34% overlap, suggesting that English–German and English–Spanish differ in different ways.