Question: Do second language learners have less hierarchical (i.e. “smoother”) spaces, relative to native speakers?
Data: Small World of Words dataset. In the task, participants are given a cue, and are asked to generate 3 associates. Each participant completes 15-19 trials.
Method: Construct an unweighted network of cue-association pairs (i.e. edge between cue and associate if cue produced associate) for each language group (native and non-native English speakers). For each network, measure modularity of the network using Newman (2006) algorithm. Predict that non-native speakers will have less modular spaces than native speakers, despite having smaller vocabularies.
d.raw = read.csv("../data/associations_ppdetails_en_05_01_2015.csv")
lang.codes = read.csv("../data/language_codes.csv") %>%
select(ISO639.2BCode, LanguageName)
d.long = d.raw %>%
gather("association", "word", 7:9) %>%
mutate(word = gsub("\\bx\\b", "NA", word)) %>% # remove missing words
spread("association", "word") %>%
rename(a1 = asso1Clean,
a2 = asso2Clean,
a3 = asso3Clean)
#### d.clean Notes ####
# LANGUAGE CODES: I am excluding language codes codes that I either don't understand (e.g. eng) or are missing.
# I am asuming that all upper case codes are country codes indicating where English is spoken, but this should be verified.
# CUES: I am excluding all cues that are two words (e.g. "head & shoulders") or are only one letter ("b")
d.clean = d.long %>%
left_join(lang.codes, by = c("nativeLanguage" = "ISO639.2BCode")) %>%
filter(nativeLanguage != "eng" & nativeLanguage != "" & nativeLanguage != "99" &
nativeLanguage != "fla" & nativeLanguage != "can" & nativeLanguage != "nan" & nativeLanguage != "pun" & nativeLanguage != "nl") %>%
#filter(nchar(as.character(cue)) > 1) %>%
# filter(sapply(gregexpr("[[:alpha:]]+", cue), function(x) sum(x > 0)) == 1) %>%
mutate(LanguageName = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), "English", as.character(LanguageName)),
LanguageName = as.factor(LanguageName),
country = ifelse(grepl("^[[:upper:]]+$", nativeLanguage), nativeLanguage, NA),
country = as.factor(country),
native.lang = ifelse(LanguageName == "English", "english", "other"),
native.lang = as.factor(native.lang)) %>%
select(-nativeLanguage)
Constants
# when sampling participants, how many samples to average over?
NSAMP <- 25
demo.summary = d.clean %>%
group_by(userID) %>%
slice(1)
We have 71781 participants total. Based on “nativeLanguage” variable, we code each participant as native speaker of English, and non-native otherwise. We have 62412 native English speakers and 9369 non-native English speakers.
Non-native speakers are more highly educated and younger than native speakers.
demo.summary %>%
group_by(LanguageName, native.lang) %>%
summarise(n = n()) %>%
ungroup() %>%
arrange(-n) %>%
slice(1:25) %>%
ggplot(aes(x = reorder(LanguageName,-n), y = n, fill = native.lang)) +
geom_bar(stat = "identity") +
xlab("language") +
theme_bw(base_size = 15) +
theme(legend.position="none",
axis.text.x = element_text(angle = 90, hjust = 1))
demo.summary %>%
group_by(native.lang, education) %>%
summarise(n = n()) %>%
mutate(prop_participants = n / sum(n),
education = as.factor(education)) %>%
ggplot(aes(x = education, y = prop_participants, group = native.lang,
fill = native.lang)) +
geom_bar(stat = "identity", position = "dodge") +
ggtitle("Education distribution by language") +
theme_bw()
demo.summary %>%
ggplot(aes(x = age, group = native.lang, fill = native.lang)) +
geom_density(alpha = .4) +
ggtitle("Age distribution by language") +
theme_bw()
Get edges, collapsing across three associates.
# get df collapasing across all associates
d.all.ca = d.clean %>%
ungroup() %>%
gather("associate.type", "associate", 6:8) %>%
filter(cue != "NA" & associate != "NA" )
d.all.ca %>%
group_by(native.lang) %>%
distinct(cue, associate, .keep_all = TRUE) %>%
summarise(n = n()) %>%
kable(caption = "Total number of distinct cue-association pairs per group")
| native.lang | n |
|---|---|
| english | 1084756 |
| other | 263392 |
Compute modularity for native and non-native participants using igraph::cluster_leading_eigen algorithm (Newman, 2006). Note that this is the same as igraph::leading.eigenvector.community.
get_modularity <- function(df){
clusters <- df %>%
select(cue, associate) %>%
graph_from_data_frame(directed = FALSE) %>% # first 2 cols as source/target for each edge
simplify() %>% # remove duplicates and loops (dog -> cat & cat -> dog; dog -> dog)
cluster_leading_eigen(options = list(maxiter=1000000))
data.frame(Q = round(clusters$modularity,2),
n.groups = round(length(clusters),2))
}
basic.analysis = d.all.ca %>%
group_by(native.lang) %>%
do(get_modularity(.))
kable(basic.analysis)
| native.lang | Q | n.groups |
|---|---|---|
| english | 0.26 | 4 |
| other | 0.28 | 4 |
Under this analysis, non-native slightly more modular.
The above analysis isn’t totally fair though because there are dramatically different numbers of participants between the two groups. Here we down-sample the native group so that the number of participants in the two groups are the same.
# Get dataframe of english speakers only for down-sampling
d.all.ca.eng = filter(d.all.ca, native.lang == "english")
# Sample N participants from dataframe and get modularity
mod_one_sample <- function(df, num_participants) {
sampled_ids = sample(unique(df$userID), num_participants)
df %>%
filter(userID %in% sampled_ids) %>%
do(get_modularity(.))
}
# Number of non-native participants for determine size of downsample for native speakers
N_NONNATIVE_PARTICIPANTS <- d.clean %>%
ungroup() %>%
filter(native.lang == "other") %>%
summarize(n = length(unique(userID))) %>%
as.numeric()
# Down-sample and estimate modularity NSAMP times. Then take mean modularity across samples.
downsampled.native = 1:NSAMP %>%
map(function(n) mod_one_sample(d.all.ca.eng, N_NONNATIVE_PARTICIPANTS)) %>%
bind_rows() %>%
mutate(native.lang = "downsampled.native") %>%
group_by(native.lang) %>%
multi_boot_standard(column = "Q", na.rm = T) %>%
rename(Q = mean)
basic.analysis %>%
filter(native.lang == "other") %>%
select(-n.groups) %>%
mutate(ci_lower = NA,
ci_upper = NA) %>%
as.data.frame() %>%
rbind(downsampled.native) %>%
kable()
| native.lang | Q | ci_lower | ci_upper |
|---|---|---|---|
| other | 0.2800 | NA | NA |
| downsampled.native | 0.2384 | 0.19319 | 0.2756 |
Rather than pooling all non-native speakers together, compare each language group to USA-English speakers. This is closer to the central hypothesis–that it’s shared experience driving differences–rather than just 2nd language-ness. Will need to down-sample English speakers here too.
# Usa-english-only dataset
d.all.ca.usa.eng = filter(d.all.ca, country == "USA")
# Get second languages and num participants, with greater than N_CUTOFF participants
N_CUTOFF <- 500
lang.dems = demo.summary %>%
group_by(LanguageName, native.lang) %>%
summarise(n = n()) %>%
filter(native.lang == "other") %>%
arrange(-n) %>%
filter(n > N_CUTOFF) %>%
select(-native.lang) %>%
ungroup()
# Given the n for a language, sample n participants from english dataframe and get modularity for english
mod_one_sample_lang <- function(df, lang) {
# given a language, get the n_nonnative_participantsfor that lang
n_nonnative_participants = lang.dems %>%
filter(LanguageName == lang) %>%
select(n) %>%
as.numeric()
# sample n_nonnative_participants from english dataframe
sampled_ids = sample(unique(df$userID), n_nonnative_participants)
df %>%
filter(userID %in% sampled_ids) %>%
do(get_modularity(.))
}
# df for looping over language population sizes and samples
is = expand.grid(lang = lang.dems$LanguageName, samp = 1:NSAMP) %>%
left_join(lang.dems, by = c("lang" = "LanguageName"))
# Down-sample English and estimate modularity NSAMP times, for each language size. Then take mean modularity across samples for each langauge.
sampled_native_mod = map2_df(is$samp, is$lang,
function(n, lang) mod_one_sample_lang(d.all.ca.usa.eng, lang)) %>%
cbind(is) %>%
rename(lang_equiv_size = lang) %>%
group_by(lang_equiv_size) %>%
multi_boot_standard(column = "Q", na.rm = T) %>%
mutate(nativeLanguage = "eng.USA") %>%
rename(Q = mean)
Now, get the modularity for the language samples
non_native_mod = d.all.ca %>%
filter(LanguageName %in% lang.dems$LanguageName) %>%
group_by(LanguageName) %>%
do(get_modularity(.)) %>%
mutate(ci_lower = NA,
ci_upper = NA,
lang_equiv_size = LanguageName) %>%
select(-n.groups)
non_native_mod %>%
bind_rows(sampled_native_mod) %>%
mutate(native = ifelse(is.na(nativeLanguage), "non-native",
"down.sampled.native(eng.USA)")) %>%
ggplot(aes(fill = native, y = Q, x = lang_equiv_size)) +
xlab("language") +
geom_bar(stat = "identity", position = "dodge") +
geom_linerange(aes(ymax = ci_upper, ymin=ci_lower),
position = position_dodge(width = .9)) +
theme_bw()
With more experience–that is, enculteration–do speakers look more modular? In other words, do they make less weird errors as they have more common ground? Here we can look at USA-English speakers only.
MEDIAN_AGE <- median(d.all.ca.usa.eng$age)
d.all.ca.usa.eng %>%
mutate(below.median.age = ifelse(age < MEDIAN_AGE, 1, 0)) %>%
group_by(below.median.age) %>%
do(get_modularity(.)) %>%
kable()
| below.median.age | Q | n.groups |
|---|---|---|
| 0 | 0.30 | 4 |
| 1 | 0.27 | 4 |
Participants who are older are more modular.
Sampling participants based on size of min bin
# Get bins; this was selected somewhat arbitrary to maximize range as well as min number of participants
d.all.ca.usa.eng = d.all.ca.usa.eng %>%
mutate(age.bin = cut(age, seq(15,65,5)))
# Given min_participants, sample participants, and calculate modularity for each age group
mod_one_sample_age <- function(dataframe, age, n_participants) {
age2 <- age # this is a bug, not sure why this is necessary
d = filter(dataframe, age.bin == age2)
# sample n_nonnative_participants from age dataframe
sampled_ids = sample(unique(d$userID), n_participants)
d %>%
filter(userID %in% sampled_ids) %>%
do(get_modularity(.))
}
# Number of participants in smallest age group
MIN_PARTICIPANTS_IN_AGE_GROUP <- d.all.ca.usa.eng %>%
group_by(age.bin) %>%
summarize(n = length(unique(userID))) %>%
arrange(n) %>%
select(n) %>%
slice(1) %>%
as.numeric()
# df for looping over language population sizes and samples
is = expand.grid(age.bin = levels(d.all.ca.usa.eng$age.bin),
samp = 1:NSAMP)
sampled.mod.age = map2_df(is$samp, is$age.bin,
function(n, age.bin) mod_one_sample_age(d.all.ca.usa.eng, age.bin, MIN_PARTICIPANTS_IN_AGE_GROUP)) %>%
bind_rows() %>%
cbind(is) %>%
group_by(age.bin) %>%
multi_boot_standard(column = "Q", na.rm = T)
sampled.mod.age %>%
ggplot(aes(x = age.bin, y = mean, group = 1)) +
geom_point() +
geom_line() +
geom_ribbon(aes(ymax = ci_upper, ymin=ci_lower), alpha = .1) +
theme_bw()
Participants who are older are more modular.