read in data

# Raw association responses with participant details (per the file name).
d.raw <- read.csv(file = "../../data/associations_ppdetails_en_05_01_2015.csv")

# Language-code lookup table, reduced to the code column and the language name.
lang.codes <- select(
  read.csv(file = "../../data/language_codes.csv"),
  ISO639.2BCode, LanguageName
)

# Recode the missing-response marker in the three association columns
# (columns 7:9 of the raw data) and give those columns short names.
# The columns are gathered to long form, recoded and spread back to wide form.
# Later steps select columns by position, so this round trip -- and the column
# order it produces -- is deliberately kept.
d.long <- d.raw %>%
  gather("association", "word", 7:9) %>%
  # A response that is exactly "x" is presumably the dataset's marker for a
  # missing word; it is recoded to the string "NA", which later steps filter on.
  # The pattern is anchored so that an "x" token inside a longer response
  # (e.g. "x ray") is left intact instead of being rewritten to "NA ray".
  mutate(word = gsub("^x$", "NA", word)) %>%
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean)

# Attach language names and derive the native-language variables:
#  - rows whose nativeLanguage is NA or one of the listed codes are dropped;
#  - an all-uppercase nativeLanguage value is labelled "English" and is also
#    stored in `country` (NA for every other row);
#  - native.lang is "english" when LanguageName is "English", else "other".
d.clean <- d.long %>%
  left_join(lang.codes, by = c("nativeLanguage" = "ISO639.2BCode")) %>%
  filter(!is.na(nativeLanguage),
         !(nativeLanguage %in%
             c("eng", "", "99", "fla", "can", "nan", "pun", "nl"))) %>%
  mutate(
    LanguageName = as.factor(ifelse(grepl("^[[:upper:]]+$", nativeLanguage),
                                    "English",
                                    as.character(LanguageName))),
    country = as.factor(ifelse(grepl("^[[:upper:]]+$", nativeLanguage),
                               nativeLanguage, NA)),
    native.lang = as.factor(ifelse(LanguageName == "English",
                                   "english", "other"))
  ) %>%
  select(-nativeLanguage)

# Non-English native languages represented by more than N_CUTOFF distinct
# participants, with their participant counts (largest first).
N_CUTOFF <- 700

lang.dems <- d.clean %>%
  # one row per participant / language combination
  distinct(userID, LanguageName, native.lang) %>%
  group_by(LanguageName, native.lang) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  filter(native.lang == "other", n > N_CUTOFF) %>%
  arrange(desc(n)) %>%
  select(-native.lang)

collapse across associates

# Long-format responses (one row per cue/associate pair) for English plus the
# languages retained in lang.dems, with a "cue associate" bigram column.
d.all.ca <- d.clean %>%
  ungroup() %>%
  filter(LanguageName %in%
           c(as.character(lang.dems$LanguageName), "English")) %>%
  # NOTE(review): positions 6:8 are presumably the a1/a2/a3 columns -- confirm
  gather("associate.type", "associate", 6:8) %>%
  # drop rows whose cue or associate carries the missing marker string "NA"
  filter(cue != "NA", associate != "NA") %>%
  mutate(bigram = paste(cue, associate))

Participant counts

# Number of distinct participants per language group, largest group first.
d.all.ca %>%
  distinct(userID, LanguageName, native.lang) %>%
  group_by(LanguageName, native.lang) %>%
  summarize(n = n()) %>%
  arrange(desc(n)) %>%
  kable()
LanguageName native.lang n
English english 62412
German other 1176
Spanish other 1071

x2

from: Manning and Schuetze (1999; pg. 171)

For each cue, compare the distribution over associates for native and non-native speakers.

# Per-cue chi-square comparison of the associate distributions of two
# language groups.
#
# data   : long-format responses with LanguageName, cue and associate columns
# lang_a : first language group (a value of LanguageName)
# lang_b : second language group (a value of LanguageName)
# label  : tag stored in the `languages` column of the result
#
# Returns an ungrouped data frame with one row per cue and the columns
# cue, x2 (chi-square statistic), p (p-value) and languages.
pairwise_x2 <- function(data, lang_a, lang_b, label) {
  # Count how often each associate was given for each cue in each group,
  # then put the two groups' counts side by side.
  counts <- data %>%
    as_tibble() %>%
    filter(LanguageName == lang_a | LanguageName == lang_b) %>%
    group_by(LanguageName, cue, associate) %>%
    summarize(n = n()) %>%
    ungroup() %>%
    spread(LanguageName, n)

  # Keep only associates that were produced by both groups.
  shared <- counts[!is.na(counts[[lang_a]]) & !is.na(counts[[lang_b]]), ]

  shared %>%
    group_by(cue) %>%
    # With a single shared associate the table is 2 x 1. chisq.test() then
    # silently runs a goodness-of-fit test on the two counts instead of a test
    # of independence, which mostly reflects the unequal group sizes. Such cues
    # are therefore excluded.
    filter(n() > 1) %>%
    do({
      # run the test once per cue and read both values from the same result
      test <- chisq.test(rbind(.[[lang_a]], .[[lang_b]]))
      data.frame(x2 = unname(test$statistic), p = test$p.value)
    }) %>%
    ungroup() %>%
    mutate(languages = label)
}

x2.scores.EG <- pairwise_x2(d.all.ca, "English", "German", "EG")
x2.scores.ES <- pairwise_x2(d.all.ca, "English", "Spanish", "ES")
x2.scores.SG <- pairwise_x2(d.all.ca, "Spanish", "German", "SG")

# All three comparisons stacked, tagged by the `languages` column.
x2.scores <- rbind(x2.scores.SG, x2.scores.ES, x2.scores.EG)

x2 scores

# Histogram of the per-cue chi-square statistics, one panel per language pair.
ggplot(data = x2.scores, mapping = aes(x = x2)) +
  geom_histogram() +
  facet_wrap(~languages) +
  ggtitle("distribution of x2-scores") +
  theme_bw()

Top x2 scores for English-Spanish

  # The 100 cues with the largest chi-square statistic for English vs. Spanish.
  # ungroup() removes any row-wise or grouped structure x2.scores may carry, so
  # that slice() indexes the whole sorted table rather than each group.
  x2.scores %>%
    ungroup() %>%
    filter(languages == "ES") %>%
    arrange(desc(x2)) %>%
    select(cue) %>%
    slice(1:100) %>%
    as.list()
## $cue
##   [1] cob          perhaps      saddle       educator     jigsaw      
##   [6] assist       woof         revolver     polar        secure      
##  [11] transform    fir          merry        willow       feline      
##  [16] forecast     grass        skim         equilibrium  crescent    
##  [21] smiling      loaf         banks        flood        paycheck    
##  [26] probable     remain       autumn       bluejay      boy         
##  [31] HIV          koala        man          cuisine      dad         
##  [36] integer      vacant       alloy        boil         library     
##  [41] foreplay     nog          puck         flipper      boulevard   
##  [46] bro          feral        investment   numerous     phony       
##  [51] erupt        almighty     cafe         examination  intellectual
##  [56] carton       invitation   spicy        spoon        sugar       
##  [61] tunes        woods        budgie       shoot        swatter     
##  [66] uncommon     bumpy        cedar        reflect      refrain     
##  [71] solo         scramble     debate       almonds      improve     
##  [76] jogging      onward       rhythm       shut         yourself    
##  [81] dialect      frequent     frugal       gooey        hard        
##  [86] perch        prod         veranda      ponytail     dressing    
##  [91] ouch         salutation   dog          ripe         come back   
##  [96] haven        speck        rhythmic     cost         orange      
## 10050 Levels: a a few a little a lot aardvark abacus abandon ... zucchini

Top x2 scores for English-German

  # The 100 cues with the largest chi-square statistic for English vs. German.
  # ungroup() removes any row-wise or grouped structure x2.scores may carry, so
  # that slice() indexes the whole sorted table rather than each group.
  x2.scores %>%
    ungroup() %>%
    filter(languages == "EG") %>%
    arrange(desc(x2)) %>%
    select(cue) %>%
    slice(1:100) %>%
    as.list()
## $cue
##   [1] spud         hi           either       not good     quench      
##   [6] bumble       glucose      leash        actual       scheme      
##  [11] boulder      calf         lobe         braces       loaf        
##  [16] ancient      keg          authentic    wreak        beacon      
##  [21] detest       paycheck     trout        estimate     migraine    
##  [26] Pluto        station      cops         lawn         strike      
##  [31] stump        hare         cuckoo       quail        boa         
##  [36] dad          duration     we           confidential pony        
##  [41] up           isolated     mustang      nada         annual      
##  [46] upbeat       doze         blot         whipped      chat        
##  [51] saber        salutations  assets       cardboard    communicate 
##  [56] sooner       disappointed queer        anxious      astonish    
##  [61] counterfeit  transition   exact        miner        sesame      
##  [66] there        tumble       big          immaculate   squander    
##  [71] bitch        constrictor  moat         prohibition  crossbow    
##  [76] drove        campsite     courteous    miniscule    pirate      
##  [81] vulture      groan        drop         everybody    lake        
##  [86] ouch         brevity      bungalow     hyphen       dialect     
##  [91] acquire      crow         gorilla      wicked       chuck       
##  [96] exposed      fearless     ideal        smack        triad       
## 10050 Levels: a a few a little a lot aardvark abacus abandon ... zucchini

Percentage overlap

# Number of top-ranked cues compared between the two language pairs.
N_TOP <- 3000

# Cues of one language pair, ranked by decreasing chi-square statistic and
# truncated to the first n rows (all rows if fewer than n are available).
#
# scores : data frame with cue, x2 and languages columns
# pair   : value of `languages` to select (e.g. "ES")
# n      : maximum number of cues to return
top_cues <- function(scores, pair, n) {
  scores %>%
    ungroup() %>%
    filter(languages == pair) %>%
    arrange(desc(x2)) %>%
    select(cue) %>%
    head(n)
}

ES.top <- top_cues(x2.scores, "ES", N_TOP)
EG.top <- top_cues(x2.scores, "EG", N_TOP)

# Proportion of cues shared by the two lists. The denominator is the size of
# the smaller list, which equals N_TOP whenever both pairs have at least N_TOP
# cues; this keeps the proportion meaningful when fewer cues are available.
length(intersect(ES.top$cue, EG.top$cue)) / min(nrow(ES.top), nrow(EG.top))
## [1] 0.343

Of the top 3000 most different cues for each comparison, there’s only 34% overlap, suggesting that the English–German and English–Spanish differences involve largely different cues.