Question: Do second language learners have less hierarchical (i.e. “smoother”) spaces, relative to native speakers?

Data: Small World of Words dataset. In the task, participants are given a cue, and are asked to generate 3 associates. Each participant completes 15-19 trials.

Method: Construct an unweighted network of cue-association pairs (i.e. edge between cue and associate if cue produced associate) for each language group (native and non-native English speakers). For each network, measure modularity of the network using Newman (2006) algorithm. Predict that non-native speakers will have less modular spaces than native speakers, despite having smaller vocabularies.

d.raw = read.csv("../data/associations_ppdetails_en_05_01_2015.csv")

lang.codes = read.csv("../data/language_codes.csv") %>%
  select(ISO639.2BCode, LanguageName)

d.long = d.raw %>%
  gather("association", "word", 7:9) %>%
  mutate(word = gsub("\\bx\\b", "NA", word)) %>% # remove missing words
  spread("association", "word") %>%
  rename(a1 = asso1Clean,
         a2 = asso2Clean,
         a3 = asso3Clean) 

#### d.clean Notes ####
# LANGUAGE CODES: I am excluding language codes codes that I either don't understand (e.g. eng) or are missing.
# I am asuming that all upper case codes are country codes indicating where English is spoken, but this should be verified. 
# CUES: I am excluding all cues that are two words (e.g. "head & shoulders") or are only one letter ("b")

d.clean = d.long %>%
  left_join(lang.codes, by = c("nativeLanguage" = "ISO639.2BCode")) %>%
  filter(nativeLanguage != "eng" & nativeLanguage != "" & nativeLanguage != "99" &
          nativeLanguage != "fla" & nativeLanguage != "can"  & nativeLanguage != "nan"  & nativeLanguage != "pun" & nativeLanguage != "nl") %>%
  #filter(nchar(as.character(cue)) > 1) %>%
 # filter(sapply(gregexpr("[[:alpha:]]+", cue), function(x) sum(x > 0)) == 1) %>%
  mutate(LanguageName = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), "English", as.character(LanguageName)),
         LanguageName = as.factor(LanguageName),
         country = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), nativeLanguage, NA),
         country = as.factor(country),
         native.lang = ifelse(LanguageName == "English", "english", "other"),
         native.lang = as.factor(native.lang)) %>%
  select(-nativeLanguage)

Constants

# when sampling participants, how many samples to average over?
NSAMP <- 25

Demographics

demo.summary = d.clean %>%
  group_by(userID) %>%
  slice(1)

We have 71781 participants total. Based on “nativeLanguage” variable, we code each participant as native speaker of English, and non-native otherwise. We have 62412 native English speakers and 9369 non-native English speakers.

Non-native speakers are more highly educated and younger than native speakers.

demo.summary %>%
  group_by(LanguageName, native.lang) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  arrange(-n) %>%
  slice(1:25) %>%
  ggplot(aes(x = reorder(LanguageName,-n), y = n, fill = native.lang)) +
    geom_bar(stat = "identity") +
    xlab("language") +
    theme_bw(base_size = 15) +
    theme(legend.position="none",
    axis.text.x = element_text(angle = 90, hjust = 1))

demo.summary %>%
  group_by(native.lang, education) %>%
  summarise(n = n()) %>%
  mutate(prop_participants = n / sum(n),
         education = as.factor(education)) %>%
  ggplot(aes(x = education, y = prop_participants, group = native.lang, 
             fill = native.lang)) +
    geom_bar(stat = "identity", position = "dodge") +
    ggtitle("Education distribution by language") +
    theme_bw()

demo.summary %>%
  ggplot(aes(x = age, group = native.lang, fill = native.lang)) +
    geom_density(alpha = .4) +
    ggtitle("Age distribution by language") +
    theme_bw()

Modularity: native vs. non-native english speakers

Basic analysis

Get edges, collapsing across three associates.

# get df collapasing across all associates
d.all.ca = d.clean %>%
  ungroup() %>%
  gather("associate.type", "associate", 6:8) %>%
  filter(cue != "NA" & associate != "NA" ) 

d.all.ca %>%
  group_by(native.lang) %>%
  distinct(cue, associate, .keep_all = TRUE)  %>%
  summarise(n = n()) %>%
  kable(caption = "Total number of distinct cue-association pairs per group")

Total number of distinct cue-association pairs per group
native.lang	n
english	1084756
other	263392

Compute modularity for native and non-native participants using igraph::cluster_leading_eigen algorithm (Newman, 2006). Note that this is the same as igraph::leading.eigenvector.community.

get_modularity <- function(df){
    clusters <- df %>%
        select(cue, associate) %>%
        graph_from_data_frame(directed = FALSE) %>% # first 2 cols as source/target for each edge
        simplify() %>% # remove duplicates and loops (dog -> cat & cat -> dog; dog -> dog)
        cluster_leading_eigen(options = list(maxiter=1000000))
    
    data.frame(Q = round(clusters$modularity,2), 
               n.groups = round(length(clusters),2))
}

basic.analysis = d.all.ca %>%
  group_by(native.lang) %>%
  do(get_modularity(.)) 

kable(basic.analysis)

native.lang	Q	n.groups
english	0.26	4
other	0.28	4

Under this analysis, non-native slightly more modular.

Down-sampling ns

The above analysis isn’t totally fair though because there are dramatically different numbers of participants between the two groups. Here we down-sample the native group so that the number of participants in the two groups are the same.

# Get dataframe of english speakers only for down-sampling
d.all.ca.eng =  filter(d.all.ca, native.lang == "english") 

# Sample N participants from dataframe and get modularity
mod_one_sample <- function(df, num_participants) {
  sampled_ids = sample(unique(df$userID), num_participants)
  
  df %>%
    filter(userID %in% sampled_ids) %>%
    do(get_modularity(.)) 
}

# Number of non-native participants for determine size of downsample for native speakers
N_NONNATIVE_PARTICIPANTS <- d.clean %>%
  ungroup() %>%
  filter(native.lang == "other") %>%
  summarize(n = length(unique(userID))) %>%
  as.numeric()

# Down-sample and estimate modularity NSAMP times. Then take mean modularity across samples.
downsampled.native = 1:NSAMP %>%
  map(function(n) mod_one_sample(d.all.ca.eng, N_NONNATIVE_PARTICIPANTS)) %>%
  bind_rows() %>% 
  mutate(native.lang = "downsampled.native") %>%
  group_by(native.lang) %>%
  multi_boot_standard(column = "Q", na.rm = T) %>%
  rename(Q = mean)

basic.analysis %>% 
  filter(native.lang == "other") %>%
  select(-n.groups) %>%
  mutate(ci_lower = NA,
         ci_upper = NA) %>%
  as.data.frame() %>%
  rbind(downsampled.native) %>%
  kable()

native.lang	Q	ci_lower	ci_upper
other	0.2800	NA	NA
downsampled.native	0.2384	0.19319	0.2756

Modularity: native english speakers vs. individual second-language groups

Rather than pooling all non-native speakers together, compare each language group to USA-English speakers. This is closer to the central hypothesis–that it’s shared experience driving differences–rather than just 2nd language-ness. Will need to down-sample English speakers here too.

# Usa-english-only dataset
d.all.ca.usa.eng = filter(d.all.ca, country == "USA") 

# Get second languages and num participants, with greater than N_CUTOFF participants
N_CUTOFF <- 500

lang.dems = demo.summary %>%
    group_by(LanguageName, native.lang) %>%
    summarise(n = n()) %>%
    filter(native.lang == "other") %>%
    arrange(-n) %>%
    filter(n > N_CUTOFF) %>%
    select(-native.lang) %>%
    ungroup()

# Given the n for a language, sample n participants from english dataframe and get modularity for english
mod_one_sample_lang <- function(df, lang) {
  
  # given a language, get the n_nonnative_participantsfor that lang
  n_nonnative_participants = lang.dems %>%
                   filter(LanguageName == lang) %>%
                   select(n) %>%
                   as.numeric()
  
  # sample n_nonnative_participants from english dataframe
  sampled_ids = sample(unique(df$userID), n_nonnative_participants)
  
  df %>%
    filter(userID %in% sampled_ids) %>%
    do(get_modularity(.)) 
}

# df for looping over language population sizes and samples
is = expand.grid(lang = lang.dems$LanguageName, samp = 1:NSAMP) %>%
  left_join(lang.dems, by = c("lang" = "LanguageName"))

# Down-sample English and estimate modularity NSAMP times, for each language size. Then take mean modularity across samples for each langauge.
sampled_native_mod = map2_df(is$samp, is$lang, 
                                    function(n, lang) mod_one_sample_lang(d.all.ca.usa.eng, lang)) %>%
  cbind(is) %>%
  rename(lang_equiv_size = lang) %>%
  group_by(lang_equiv_size) %>%
  multi_boot_standard(column = "Q", na.rm = T) %>%
  mutate(nativeLanguage = "eng.USA") %>%
  rename(Q = mean)

Now, get the modularity for the language samples

non_native_mod = d.all.ca %>%
  filter(LanguageName %in% lang.dems$LanguageName) %>%
  group_by(LanguageName) %>%
  do(get_modularity(.)) %>%
  mutate(ci_lower = NA,
         ci_upper = NA,
         lang_equiv_size = LanguageName) %>%
  select(-n.groups)

non_native_mod %>%
  bind_rows(sampled_native_mod) %>%
  mutate(native = ifelse(is.na(nativeLanguage), "non-native",
                         "down.sampled.native(eng.USA)")) %>%
  ggplot(aes(fill = native, y = Q, x = lang_equiv_size)) +
    xlab("language") +
    geom_bar(stat = "identity", position = "dodge") +
    geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
                 position = position_dodge(width = .9)) +
    theme_bw()

Modularity: across age in US sample

With more experience–that is, enculteration–do speakers look more modular? In other words, do they make less weird errors as they have more common ground? Here we can look at USA-English speakers only.

Median split on age

MEDIAN_AGE <- median(d.all.ca.usa.eng$age)

d.all.ca.usa.eng %>%
  mutate(below.median.age = ifelse(age < MEDIAN_AGE, 1, 0)) %>%
  group_by(below.median.age) %>%
  do(get_modularity(.)) %>%
  kable()

below.median.age	Q	n.groups
0	0.30	4
1	0.27	4

Participants who are older are more modular.

5-year binned analysis

Sampling participants based on size of min bin

# Get bins; this was selected somewhat arbitrary to maximize range as well as min number of participants
d.all.ca.usa.eng = d.all.ca.usa.eng %>%
  mutate(age.bin = cut(age, seq(15,65,5))) 

# Given min_participants, sample participants, and calculate modularity for each age group
mod_one_sample_age <- function(dataframe, age, n_participants) {
  age2 <- age # this is a bug, not sure why this is necessary

  d = filter(dataframe, age.bin == age2)

  # sample n_nonnative_participants from age dataframe
  sampled_ids = sample(unique(d$userID), n_participants)
  
  d %>%
    filter(userID %in% sampled_ids) %>%
    do(get_modularity(.)) 
}

# Number of participants in smallest age group
MIN_PARTICIPANTS_IN_AGE_GROUP <- d.all.ca.usa.eng %>%
  group_by(age.bin) %>%
  summarize(n = length(unique(userID))) %>%
  arrange(n) %>%
  select(n) %>%
  slice(1) %>%
  as.numeric()

# df for looping over language population sizes and samples
is = expand.grid(age.bin = levels(d.all.ca.usa.eng$age.bin), 
                 samp = 1:NSAMP) 

sampled.mod.age = map2_df(is$samp, is$age.bin,
                          function(n, age.bin) mod_one_sample_age(d.all.ca.usa.eng, age.bin, MIN_PARTICIPANTS_IN_AGE_GROUP)) %>%
  bind_rows() %>% 
  cbind(is) %>%
  group_by(age.bin) %>%
  multi_boot_standard(column = "Q", na.rm = T)

sampled.mod.age %>%
  ggplot(aes(x = age.bin, y = mean, group = 1)) +
    geom_point() +
    geom_line() +
    geom_ribbon(aes(ymax = ci_upper, ymin=ci_lower), alpha = .1) +
    theme_bw()