Social data from Lupyan and Dale (2010)
Read in the Lupyan and Dale (2010) data and the complexity data
### L&D data
# Tab-separated file; "*" marks a missing value.
d <- read.table(
  "data/LD_plos_data.txt",
  fill = TRUE, header = TRUE, sep = "\t", na.strings = "*"
)

# fix language labels: lower-case the Ethnologue names, then replace the
# labels listed below with the shorter names on the right
d$ethnologue_lang_name <- tolower(d$ethnologue_lang_name)
label_fixes <- c(
  "standard german" = "german",
  "tosk albanian" = "albanian",
  "catalan-valencian-balear" = "catalan",
  "haitian creole french" = "haitian.creole",
  "irish gaelic" = "irish",
  "central khmer" = "khmer"
)
for (old_label in names(label_fixes)) {
  d$ethnologue_lang_name[d$ethnologue_lang_name == old_label] <-
    label_fixes[[old_label]]
}

# Need to collapse across different dialects of same language (e.g. "eastern mongolian" and "peripheral mongolian") [arabic, azerbaijani, chinese, hmong, mongolian, yiddish]
# but first, remove sign languages (e.g. "chinese sign language").
# A logical mask is used instead of -which(): when nothing matches, which()
# returns integer(0) and d[-integer(0), ] would select zero rows.
d <- d[!grepl("sign language", d$ethnologue_lang_name), ]

# eln2 is the language name with dialects collapsed: any name containing one
# of these strings is replaced by that string, all other names are kept.
dialect_groups <- c(
  "arabic", "azerbaijani", "chinese", "hmong", "mongolian", "yiddish"
)
d$eln2 <- d$ethnologue_lang_name
# Assign in reverse so that, if a name matched several groups, the group
# listed first wins.
for (group in rev(dialect_groups)) {
  d$eln2[grepl(group, d$ethnologue_lang_name)] <- group
}
d$eln2 <- as.factor(d$eln2)
# 12 langs not in pops [belarusian, bosnian, cebuana, croatian, esperanto, filipino, kanada, latin, norwegian, persian, punjabi, serbian]
### Complexity data
c_l <- read.csv("data/xling_cors.csv")

# Old column name (left) -> new column name (right).
renamed_cols <- c(
  language = "eln2",
  corr = "complexity.bias",
  p.corr = "p.complexity.bias",
  mono.cor = "mono.complexity.bias",
  open.cor = "open.complexity.bias"
)
# Columns not listed keep their names; listed names that are absent from the
# file are simply skipped.
to_rename <- names(c_l) %in% names(renamed_cols)
names(c_l)[to_rename] <- unname(renamed_cols[names(c_l)[to_rename]])
Merge LD demographic/geographic variables (quantitative) with complexity data
# aggregate across countries to get quantitative measures by language
d_copy <- d

# Per-language mean (missing values ignored) of the columns selected by
# position: 8:9 and 16:121.
demo <- d %>%
  group_by(eln2) %>%
  summarise_each(funs(mean(., na.rm = TRUE)), c(8:9, 16:121))

# add in language family: langFamily/langGenus are taken from the first row
# of each language, then the per-language means are joined on
demo <- d_copy %>%
  group_by(eln2) %>%
  filter(row_number() == 1) %>%
  select(eln2, langFamily, langGenus) %>%
  left_join(demo, by = "eln2")

# merge with complexity data, add log-transformed versions of six measures,
# then drop the untransformed columns and a few other unused ones
clp <- c_l %>%
  left_join(demo, by = "eln2") %>%
  mutate(
    log.max_lang_population = log(max_lang_population),
    log.area = log(area),
    log.perimeter = log(perimeter),
    log.numNeighbors = log(numNeighbors),
    log.numStations = log(numStations),
    log.sdTemp = log(sdTemp)
  ) %>%
  select(
    -max_lang_population, -area, -langCountryPop,
    -perimeter, -numNeighbors, -numStations,
    -sdTemp, -drycas2, -logpop2, -max_lang_populationFlooredAt50
  )

# A note on population variables:
# for our set of 80 languages, max_lang_population == max_lang_populationFlooredAt50 and logpop == logpop2.
# the logpop variables are not the same as log.max_lang_population, not sure where these variables come from, maybe mean population? (rather than sum)

# remove infs: replace infinite values in every column with NA and rebuild
# the data frame from the cleaned columns
clp <- do.call(
  data.frame,
  lapply(clp, function(column) {
    column[is.infinite(column)] <- NA
    column
  })
)
Plot complexity bias against all demographic/geographic variables
# Scatter plot of complexity.bias against one column of df, with a linear
# fit and a red annotation giving the Pearson r plus significance markers.
#
# Arguments:
#   df  data frame containing complexity.bias, p.complexity.bias,
#       mono.complexity.bias, open.complexity.bias and the column named by x
#   x   name (string) of the column to put on the x axis
#
# Returns a ggplot object. The annotation is "r=<value>" followed by one
# character per correlation with x that has p < .05:
#   "*" complexity.bias, "f" p.complexity.bias,
#   "m" mono.complexity.bias, "o" open.complexity.bias
#
# Relies on ggplot2 and on `themeML`, which is defined outside this function.
con.plot <- function(df, x) {
  # plot setup
  # [[ ]] returns a plain vector for both data.frames and tibbles
  df$x <- df[[x]]
  x_range <- range(df$x, na.rm = TRUE)
  # horizontal position of the label: 75% of the way along the x range
  label_x <- x_range[1] + diff(x_range) * .75

  # get corr significance
  # cor.test() drops incomplete pairs itself and has no `use` argument, so
  # none is passed. isTRUE() treats an NA p-value (e.g. a constant column)
  # as "not significant" instead of making `if` fail.
  bias_cols <- c(
    "*" = "complexity.bias",
    "f" = "p.complexity.bias",
    "m" = "mono.complexity.bias",
    "o" = "open.complexity.bias"
  )
  is_sig <- vapply(
    bias_cols,
    function(col) isTRUE(cor.test(df[[col]], df$x)$p.value < .05),
    logical(1)
  )
  sig_char <- paste0(names(bias_cols)[is_sig], collapse = "")

  r_label <- paste0(
    "r=",
    round(cor(df$complexity.bias, df$x, use = "complete.obs"), 2),
    sig_char
  )

  # plot
  ggplot(df, aes(y = complexity.bias, x = x)) +
    geom_point() +
    geom_smooth(method = "lm", color = "blue") +
    ylab("CB") +
    xlab(x) +
    annotate("text", x = label_x, y = .6, color = "red", size = 4,
             label = r_label) +
    themeML
}
# Drop columns of clp by position.
# NOTE(review): presumably this leaves the four complexity-bias correlations
# followed by the demographic/geographic predictors - confirm against the
# column order of clp.
clp.plot <- clp[, c(-1:-4, -6, -10:-12, -15:-101)]
demoVarNames <- names(clp.plot)[-1:-4] # remove complexity.bias corrs

# make plots: one con.plot() per remaining variable.
# lapply() yields an empty list when there are no variables, whereas
# 1:length(x) would iterate over c(1, 0) and index past the vector.
plots <- lapply(
  demoVarNames,
  function(var_name) con.plot(clp.plot, var_name)
)
#quartz()
if (length(plots) > 0) {
  multiplot(plotlist = plots, cols = 3)
}
Merge morphological L&D variables (qualitative) with complexity data
# aggregate across countries to get qualitative measures by language
#
# Return the most frequent non-missing value of x, as a character string.
#
# Arguments:
#   x  a vector or factor; NA entries are ignored
#
# Returns a length-one character vector. Ties go to the value that comes
# first in the ordering used by table(x). If x has no non-missing values the
# result is NA_character_: table() keeps zero-count factor levels, so without
# this check an all-NA factor would report its first level as "most
# frequent", and an all-NA non-factor would return NULL.
most.frequent.level <- function(x) {
  counts <- table(x)
  if (length(counts) == 0L || max(counts) == 0L) {
    return(NA_character_)
  }
  names(counts)[which.max(counts)]
}
# make morphological features factors (columns 18:104, selected by position)
d[, 18:104] <- colwise(as.factor)(d[, 18:104])

# aggregate L&D data by language: after dropping columns by position, every
# remaining column is reduced to its most frequent level within each language
qual <- summarise_each(
  group_by(
    select(d, -1:-3, -5:-9, -13, -15:-17, -104:-122),
    eln2
  ),
  funs(most.frequent.level)
)

# merge in complexity data, then drop columns by position
qualp <- select(left_join(c_l, qual, by = "eln2"), -1:-4, -6:-9)

# make everything factors
qualp[, 2:92] <- colwise(as.factor)(qualp[, 2:92])

# keep a copy of qualp as it is at this point
qualp_copy <- qualp
Plot complexity bias against all morphological features
# Plotting function
#
# Bar plot of mean complexity.bias for each level of one categorical column,
# with error bars from ci_lower to ci_upper, bars ordered by ascending mean.
#
# Arguments:
#   df          data frame with a complexity.bias column and the column
#               named by lingFactor
#   lingFactor  name (string) of the categorical column to plot
#   labs        data frame with Name and Description columns; the
#               Description whose Name equals lingFactor becomes the title
#
# Returns a ggplot object. Rows where the feature is NA are excluded.
# Relies on dplyr, ggplot2, multi_boot() and `themeML`, none of which are
# defined in this function.
qual.plot <- function(df, lingFactor, labs) {
  # [[ ]] returns a plain vector for both data.frames and tibbles.
  # Inside filter() below, `lingFactor` refers to this copied column.
  df$lingFactor <- df[[lingFactor]]

  ms <- df %>%
    filter(!is.na(lingFactor)) %>%
    multi_boot(column = "complexity.bias",
               summary_groups = lingFactor,
               statistics_functions = c("mean", "ci_lower", "ci_upper"))

  # make factor levels in ascending order of the mean.
  # The levels are built from the values actually present in ms, so the
  # ordering stays aligned with the rows even if the original factor carried
  # levels that do not occur in ms.
  level_names <- as.character(ms[[lingFactor]])
  ms[[lingFactor]] <- factor(
    level_names,
    levels = unique(level_names[order(ms$mean)])
  )

  # get title; as.character() guards against Description being a factor,
  # and the column name is used when no description is found
  title <- as.character(labs[labs$Name == lingFactor, "Description"])
  if (length(title) == 0L) {
    title <- lingFactor
  }

  # plot
  # NOTE(review): themeML is added after theme(); if themeML is a complete
  # theme it may override the axis-text angle and legend settings - confirm.
  ggplot(ms, aes_string(y = "mean", x = lingFactor, fill = lingFactor)) +
    geom_bar(position = "dodge", stat = "identity") +
    geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper),
                  width = 0.2, position = "dodge") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = "none") +
    ylab("CB") +
    ggtitle(title) +
    themeML
}
VARIABLES WE HAVE HYPOTHESES ABOUT
# get rid of vars we don't care about to make more manageable
dontCare <- c("complexity.bias", "checked", "continent",
              "nativeCountry", "nativeCountryArea", "langGenus")
qualVarNames <- setdiff(names(qualp), dontCare)

# include: keep only the features flagged include == 1 in the feature table.
# %in% is used so that a missing `include` value counts as "not included"
# rather than producing an all-NA row.
labNames <- read.csv("data/feature_names.csv")
toInclude <- labNames[labNames$include %in% 1, ]
qualVarNames <- intersect(qualVarNames, toInclude$Name)
# Some vars are missing from L&D data set ["HASIND" "GILDIF" "AUWPRH" "DRYSBV" "DRYOBV" "DRYXOV" "DRYREL" "POLANT" "POLAPP" "SONPER" "SONNON" "NICMTP" "NICNMP" "DAHTEA"]

# remap factor levels to human readable values (not all present)
labMappings <- read.csv("data/labMappings.csv")
for (feature in qualVarNames) {
  thisVarLabs <- labMappings[labMappings$featureName == feature, ]
  # Columns are addressed by exact name: grep() treats the name as a regular
  # expression and would also hit any column whose name merely contains it.
  qualp[[feature]] <- mapvalues(
    qualp[[feature]],
    from = as.character(thisVarLabs$oldLab),
    to = as.character(thisVarLabs$newLab)
  )
}

# Feature classes that occur among the included features. as.factor() keeps
# this working when read.csv() returns character columns, for which
# droplevels() has no method.
featureCats <- levels(droplevels(as.factor(toInclude$Feature.Class)))
Morphology
# Included features of class "Morphology" that are present in qualp.
qualVarNames.morph <- intersect(
  toInclude[toInclude$Feature.Class == "Morphology", "Name"],
  qualVarNames
)
# drop = FALSE keeps a data frame even if only one column is selected
qualp.morph <- qualp[, c("complexity.bias", qualVarNames.morph), drop = FALSE]

# One bar plot per feature. lapply() yields an empty list when no feature
# qualifies, whereas 1:length(x) would iterate over c(1, 0).
plots <- lapply(
  qualVarNames.morph,
  function(feature) qual.plot(qualp.morph, feature, labNames)
)
if (length(plots) > 0) {
  multiplot(plotlist = plots, cols = 2)
}
Nominal Categories
# Included features of class "Nominal Categories" that are present in qualp.
qualVarNames.NC <- intersect(
  toInclude[toInclude$Feature.Class == "Nominal Categories", "Name"],
  qualVarNames
)
# drop = FALSE keeps a data frame even if only one column is selected
qualp.NC <- qualp[, c("complexity.bias", qualVarNames.NC), drop = FALSE]

# One bar plot per feature. lapply() yields an empty list when no feature
# qualifies, whereas 1:length(x) would iterate over c(1, 0).
plots <- lapply(
  qualVarNames.NC,
  function(feature) qual.plot(qualp.NC, feature, labNames)
)
if (length(plots) > 0) {
  multiplot(plotlist = plots, cols = 2)
}
Nominal Syntax
# Included features of class "Nominal Syntax" that are present in qualp.
qualVarNames.NS <- intersect(
  toInclude[toInclude$Feature.Class == "Nominal Syntax", "Name"],
  qualVarNames
)
# drop = FALSE keeps a data frame even if only one column is selected
qualp.NS <- qualp[, c("complexity.bias", qualVarNames.NS), drop = FALSE]

# One bar plot per feature. lapply() yields an empty list when no feature
# qualifies, whereas 1:length(x) would iterate over c(1, 0).
plots <- lapply(
  qualVarNames.NS,
  function(feature) qual.plot(qualp.NS, feature, labNames)
)
if (length(plots) > 0) {
  multiplot(plotlist = plots, cols = 2)
}
Semantic Lexicon
# Included features of class "Semantic Lexicon" that are present in qualp.
qualVarNames.SL <- intersect(
  toInclude[toInclude$Feature.Class == "Semantic Lexicon", "Name"],
  qualVarNames
)
# drop = FALSE keeps a data frame even if only one column is selected
qualp.SL <- qualp[, c("complexity.bias", qualVarNames.SL), drop = FALSE]

# One bar plot per feature. lapply() yields an empty list when no feature
# qualifies, whereas 1:length(x) would iterate over c(1, 0).
plots <- lapply(
  qualVarNames.SL,
  function(feature) qual.plot(qualp.SL, feature, labNames)
)
if (length(plots) > 0) {
  multiplot(plotlist = plots, cols = 2)
}
Simple Clauses
# Included features of class "Simple Clauses" that are present in qualp.
qualVarNames.SC <- intersect(
  toInclude[toInclude$Feature.Class == "Simple Clauses", "Name"],
  qualVarNames
)
# drop = FALSE keeps a data frame even if only one column is selected
qualp.SC <- qualp[, c("complexity.bias", qualVarNames.SC), drop = FALSE]

# One bar plot per feature. lapply() yields an empty list when no feature
# qualifies, whereas 1:length(x) would iterate over c(1, 0).
plots <- lapply(
  qualVarNames.SC,
  function(feature) qual.plot(qualp.SC, feature, labNames)
)
if (length(plots) > 0) {
  multiplot(plotlist = plots, cols = 2)
}
Verbal Categories
# Included features of class "Verbal Categories" that are present in qualp.
qualVarNames.VC <- intersect(
  toInclude[toInclude$Feature.Class == "Verbal Categories", "Name"],
  qualVarNames
)
# drop = FALSE keeps a data frame even if only one column is selected
qualp.VC <- qualp[, c("complexity.bias", qualVarNames.VC), drop = FALSE]

# One bar plot per feature. lapply() yields an empty list when no feature
# qualifies, whereas 1:length(x) would iterate over c(1, 0).
plots <- lapply(
  qualVarNames.VC,
  function(feature) qual.plot(qualp.VC, feature, labNames)
)
if (length(plots) > 0) {
  multiplot(plotlist = plots, cols = 2)
}
Word order
# Included features of class "Word Order" that are present in qualp.
qualVarNames.WO <- intersect(
  toInclude[toInclude$Feature.Class == "Word Order", "Name"],
  qualVarNames
)
# drop = FALSE keeps a data frame even if only one column is selected
qualp.WO <- qualp[, c("complexity.bias", qualVarNames.WO), drop = FALSE]

# One bar plot per feature. lapply() yields an empty list when no feature
# qualifies, whereas 1:length(x) would iterate over c(1, 0).
plots <- lapply(
  qualVarNames.WO,
  function(feature) qual.plot(qualp.WO, feature, labNames)
)
if (length(plots) > 0) {
  multiplot(plotlist = plots, cols = 2)
}
VARIABLES WE HAVE HYPOTHESES ABOUT - SIMPLE CATEGORIES
# Start again from the saved copy of qualp so the remapping below is applied
# to the original level codes.
qualp <- qualp_copy

# remap factor levels to human readable values (not all present);
# this pass takes the new labels from the newLab2 column
for (feature in qualVarNames) {
  thisVarLabs <- labMappings[labMappings$featureName == feature, ]
  # Columns are addressed by exact name: grep() treats the name as a regular
  # expression and would also hit any column whose name merely contains it.
  qualp[[feature]] <- mapvalues(
    qualp[[feature]],
    from = as.character(thisVarLabs$oldLab),
    to = as.character(thisVarLabs$newLab2)
  )
}

# Feature classes that occur among the included features. as.factor() keeps
# this working when read.csv() returns character columns, for which
# droplevels() has no method.
featureCats <- levels(droplevels(as.factor(toInclude$Feature.Class)))
Morphology
# Included features of class "Morphology" that are present in qualp.
qualVarNames.morph <- intersect(
  toInclude[toInclude$Feature.Class == "Morphology", "Name"],
  qualVarNames
)
# drop = FALSE keeps a data frame even if only one column is selected
qualp.morph <- qualp[, c("complexity.bias", qualVarNames.morph), drop = FALSE]

# One bar plot per feature. lapply() yields an empty list when no feature
# qualifies, whereas 1:length(x) would iterate over c(1, 0).
plots <- lapply(
  qualVarNames.morph,
  function(feature) qual.plot(qualp.morph, feature, labNames)
)
if (length(plots) > 0) {
  multiplot(plotlist = plots, cols = 2)
}
Nominal Categories
# Included features of class "Nominal Categories" that are present in qualp.
qualVarNames.NC <- intersect(
  toInclude[toInclude$Feature.Class == "Nominal Categories", "Name"],
  qualVarNames
)
# drop = FALSE keeps a data frame even if only one column is selected
qualp.NC <- qualp[, c("complexity.bias", qualVarNames.NC), drop = FALSE]

# One bar plot per feature. lapply() yields an empty list when no feature
# qualifies, whereas 1:length(x) would iterate over c(1, 0).
plots <- lapply(
  qualVarNames.NC,
  function(feature) qual.plot(qualp.NC, feature, labNames)
)
if (length(plots) > 0) {
  multiplot(plotlist = plots, cols = 2)
}
Nominal Syntax
# Included features of class "Nominal Syntax" that are present in qualp.
qualVarNames.NS <- intersect(
  toInclude[toInclude$Feature.Class == "Nominal Syntax", "Name"],
  qualVarNames
)
# drop = FALSE keeps a data frame even if only one column is selected
qualp.NS <- qualp[, c("complexity.bias", qualVarNames.NS), drop = FALSE]

# One bar plot per feature. lapply() yields an empty list when no feature
# qualifies, whereas 1:length(x) would iterate over c(1, 0).
plots <- lapply(
  qualVarNames.NS,
  function(feature) qual.plot(qualp.NS, feature, labNames)
)
if (length(plots) > 0) {
  multiplot(plotlist = plots, cols = 2)
}
Semantic Lexicon
# Included features of class "Semantic Lexicon" that are present in qualp.
qualVarNames.SL <- intersect(
  toInclude[toInclude$Feature.Class == "Semantic Lexicon", "Name"],
  qualVarNames
)
# drop = FALSE keeps a data frame even if only one column is selected
qualp.SL <- qualp[, c("complexity.bias", qualVarNames.SL), drop = FALSE]

# One bar plot per feature. lapply() yields an empty list when no feature
# qualifies, whereas 1:length(x) would iterate over c(1, 0).
plots <- lapply(
  qualVarNames.SL,
  function(feature) qual.plot(qualp.SL, feature, labNames)
)
if (length(plots) > 0) {
  multiplot(plotlist = plots, cols = 2)
}