Social data from Lupyan and Dale (2010); dependency length (DL) data from Futrell et al. (2015).
Read in Lupyan and Dale (2010) and Complexity Data
### L&D data
# Read the Lupyan & Dale table: tab-separated with a header row, "*" marks
# missing values, and short rows are padded (fill = TRUE).
d <- read.table(
  "data/LD_plos_data.txt",
  fill = TRUE, header = TRUE, sep = "\t", na.strings = "*"
)
# fix language labels: lower-case them, then map six Ethnologue names onto
# the shorter labels used for merging further down
d$ethnologue_lang_name <- tolower(d$ethnologue_lang_name)
d$ethnologue_lang_name[d$ethnologue_lang_name == "standard german"] <- "german"
d$ethnologue_lang_name[d$ethnologue_lang_name == "tosk albanian"] <- "albanian"
d$ethnologue_lang_name[d$ethnologue_lang_name == "catalan-valencian-balear"] <- "catalan"
d$ethnologue_lang_name[d$ethnologue_lang_name == "haitian creole french"] <- "haitian.creole"
d$ethnologue_lang_name[d$ethnologue_lang_name == "irish gaelic"] <- "irish"
d$ethnologue_lang_name[d$ethnologue_lang_name == "central khmer"] <- "khmer"
# Need to collapse across different dialects of same language (e.g. "eastern mongolian" and "peripheral mongolian") [arabic, azerbaijani, chinese, hmong, mongolian, yiddish]
# but first, remove sign languages (e.g."chinese sign language")
# A logical mask is used instead of -which(...): with no matching rows,
# -which(...) is a zero-length index and would drop every row.
d <- d[!grepl("sign language", d$ethnologue_lang_name), ]
# eln2 is the collapsed language label: any name containing one of the six
# macro-language names becomes that name (first match in the order below
# wins); every other name is kept unchanged.
d$eln2 <- ifelse(
  grepl("arabic", d$ethnologue_lang_name), "arabic",
  ifelse(
    grepl("azerbaijani", d$ethnologue_lang_name), "azerbaijani",
    ifelse(
      grepl("chinese", d$ethnologue_lang_name), "chinese",
      ifelse(
        grepl("hmong", d$ethnologue_lang_name), "hmong",
        ifelse(
          grepl("mongolian", d$ethnologue_lang_name), "mongolian",
          ifelse(
            grepl("yiddish", d$ethnologue_lang_name), "yiddish",
            d$ethnologue_lang_name
          )
        )
      )
    )
  )
)
d$eln2 <- as.factor(d$eln2)
# 12 langs not in pops [belarusian, bosnian, cebuana, croatian, esperanto, filipino, kanada, latin, norwegian, persian, punjabi, serbian]
### Complexity data
# Load the complexity table and rename its columns in one pass: `language`
# becomes the merge key `eln2`, and the correlation columns get
# "complexity.bias" names. Columns not listed in the lookup keep their names.
c_l <- read.csv("data/xling_cors.csv")
names(c_l) <- local({
  new_names <- c(
    language = "eln2",
    corr = "complexity.bias",
    p.corr = "p.complexity.bias",
    mono.cor = "mono.complexity.bias",
    open.cor = "open.complexity.bias"
  )
  cols <- names(c_l)
  known <- cols %in% names(new_names)
  cols[known] <- unname(new_names[cols[known]])
  cols
})
### Dependency data
d_l <- read.csv("data/mean_dependency_dif.csv")
Merge LD demographic/geographic variables (quantitative) and DL data with complexity data
# Collapse the country-level rows of `d` to one row per language label (eln2).
d_copy <- d
# Per-language means (missing values ignored) of the columns picked out by
# position: 8-9 and 16-121.
demo <- summarise_each(
  group_by(d, eln2),
  funs(mean(., na.rm = TRUE)),
  c(8:9, 16:121)
)
# Take language family and genus from the first row of each language, then
# attach the per-language means to them.
demo <- left_join(
  select(
    filter(group_by(d_copy, eln2), row_number() == 1),
    eln2, langFamily, langGenus
  ),
  demo,
  by = "eln2"
)
# Merge the complexity data with the per-language demographic table and the
# dependency data. Both joins are keyed on the language label (`eln2`, called
# `language` in d_l) and keep every row of c_l.
clp <- c_l %>%
  left_join(demo, by = "eln2") %>%
  left_join(d_l, by = c("eln2" = "language")) %>%
  # add log-transformed versions of the population / geography measures
  # (log(0) gives -Inf; infinite values are replaced with NA further down)
  mutate(
    log.max_lang_population = log(max_lang_population),
    log.area = log(area),
    log.perimeter = log(perimeter),
    log.numNeighbors = log(numNeighbors),
    log.numStations = log(numStations),
    log.sdTemp = log(sdTemp)
  ) %>%
  # drop the raw columns that were just logged, plus a few other population
  # and climate columns
  select(
    -max_lang_population, -area, -langCountryPop,
    -perimeter, -numNeighbors, -numStations,
    -sdTemp, -drycas2, -logpop2, -max_lang_populationFlooredAt50
  ) %>%
  # keep only languages that have a dependency measure; is.na() is the
  # explicit test (comparing against the string "NA" only worked because
  # filter() drops rows whose condition evaluates to NA)
  filter(!is.na(mean_dependency_diff)) %>%
  filter(eln2 != "english")
# A note on population variables:
# for our set of 80 languages, max_lang_population == max_lang_populationFlooredAt50 and logpop == logpop2.
# The logpop variables are not the same as log.max_lang_population; it is unclear where they come from -- possibly the mean population (rather than the sum).
# Replace infinite values with NA, column by column, and rebuild the result
# as a plain data.frame.
clp <- do.call(
  data.frame,
  lapply(clp, function(column) {
    column[is.infinite(column)] <- NA
    column
  })
)
# Pick the columns of interest by position (5, 14, 15 and 107-123) and plot
# their correlation matrix.
clp.crit <- select(clp, c(5, 14, 15, 107:123))
clp.crit %>%
  cor(use = "complete") %>%
  corrplot()
# Complexity bias against the dependency-length slope difference: one text
# label per language, a linear fit, and the correlation coefficient
# annotated in red.
ggplot(
  data = clp,
  mapping = aes(x = slope_dif, y = complexity.bias, label = eln2)
) +
  # geom_text(aes(colour = langFamily)) +
  geom_text() +
  geom_smooth(method = "lm") +
  labs(
    x = "slope difference between fixed\n random baseline and observed,\n predicting DL as a function of sentence length (Fig. 4)",
    title = "Complexity Bias"
  ) +
  theme_classic() +
  theme(legend.position = "none") +
  annotate(
    geom = "text", x = .005, y = .6, color = "red", size = 6,
    label = paste(
      "r=",
      round(cor(clp$complexity.bias, clp$slope_dif, use = "complete"), 2)
    )
  )
# Log number of neighbors against the dependency-length slope difference:
# one text label per language, a linear fit, and the correlation coefficient
# annotated in red.
ggplot(
  data = clp,
  mapping = aes(x = slope_dif, y = log.numNeighbors, label = eln2)
) +
  # geom_text(aes(colour = langFamily)) +
  geom_text() +
  geom_smooth(method = "lm") +
  labs(
    x = "slope difference between fixed\n random baseline and observed,\n predicting DL as a function of sentence length (Fig. 4)",
    title = "Number of neighbors"
  ) +
  theme_classic() +
  theme(legend.position = "none") +
  annotate(
    geom = "text", x = .005, y = 1.6, color = "red", size = 6,
    label = paste(
      "r=",
      round(cor(clp$log.numNeighbors, clp$slope_dif, use = "complete"), 2)
    )
  )