Complexity bias and social features analysis

Social data from Lupyan and Dale (2010)



Read in Lupyan and Dale (2010) and Complexity Data

### L&D data
# Load the Lupyan & Dale (2010) table: tab-separated with a header row,
# "*" marks missing values, and short rows are padded (fill = TRUE).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
d <- read.table("data/LD_plos_data.txt", fill = TRUE, header = TRUE,
                sep = "\t", na.strings = "*")

# Fix language labels: lower-case every name, then map a handful of
# Ethnologue labels onto shorter names.
# NOTE(review): presumably these match the labels used in the complexity
# data -- confirm against data/xling_cors.csv.
d$ethnologue_lang_name <- tolower(d$ethnologue_lang_name)
label_fixes <- c(
  "standard german"          = "german",
  "tosk albanian"            = "albanian",
  "catalan-valencian-balear" = "catalan",
  "haitian creole french"    = "haitian.creole",
  "irish gaelic"             = "irish",
  "central khmer"            = "khmer"
)
# %in% is FALSE for NA names, so missing labels are left untouched.
to_fix <- d$ethnologue_lang_name %in% names(label_fixes)
d$ethnologue_lang_name[to_fix] <-
  unname(label_fixes[d$ethnologue_lang_name[to_fix]])

# Need to collapse across different dialects of the same language
# (e.g. eastern and peripheral mongolian) for:
# arabic, azerbaijani, chinese, hmong, mongolian, yiddish.

# But first, remove sign languages (e.g. "chinese sign language").
# A logical mask is used instead of d[-which(...), ]: when nothing matches,
# -which(...) is -integer(0), which selects zero rows and would silently
# empty the whole table.
d <- d[!grepl("sign language", d$ethnologue_lang_name), ]

# Any name containing one of these substrings is replaced by the substring
# itself; all other names are kept as they are.
dialect_groups <- c("arabic", "azerbaijani", "chinese",
                    "hmong", "mongolian", "yiddish")
collapsed_name <- d$ethnologue_lang_name
# Apply in reverse so that, if a name ever matched two groups, the group
# listed first wins.
for (grp in rev(dialect_groups)) {
  collapsed_name[grepl(grp, d$ethnologue_lang_name)] <- grp
}
d$eln2 <- as.factor(collapsed_name)

# 12 langs not in pops [belarusian, bosnian, cebuana, croatian, esperanto, filipino, kanada, latin, norwegian, persian, punjabi, serbian]

### Complexity data
# Read the cross-linguistic correlation table and rename its columns:
# the language column becomes "eln2" and the correlation columns become
# "*complexity.bias". Columns missing from the file are simply skipped.
c_l <- read.csv("data/xling_cors.csv")
column_renames <- c(
  language = "eln2",
  corr     = "complexity.bias",
  p.corr   = "p.complexity.bias",
  mono.cor = "mono.complexity.bias",
  open.cor = "open.complexity.bias"
)
renamed <- names(c_l) %in% names(column_renames)
names(c_l)[renamed] <- unname(column_renames[names(c_l)[renamed]])

Merge LD demographic/geographic variables (quantitative) with complexity data

# aggregate across countries to get quantitative measures by language
# d_copy is an untouched copy of d, used below to recover the language
# family and genus columns.
d_copy = d
# For each eln2 group, take the mean (ignoring NAs) of the columns selected
# by position: 8-9 and 16-121.
# NOTE(review): positional selection breaks silently if the column layout of
# the input file changes -- confirm these indices against the data file.
# NOTE(review): summarise_each()/funs() are deprecated in current dplyr;
# this chunk presumably targets an older dplyr version -- confirm before
# upgrading.
demo = d %>%
     group_by(eln2) %>%
     summarise_each(funs(mean(., na.rm = TRUE)), c(8:9, 16:121))

# Keep the first row of each eln2 group, retain only eln2, langFamily and
# langGenus, then attach the per-language means computed above.
demo = d_copy %>%
      group_by(eln2) %>%
      filter(row_number() == 1) %>%
      select(eln2, langFamily, langGenus)  %>%
      left_join(demo, by = "eln2") # add in language family

# Merge with complexity data: attach the per-language demographic summary to
# each row of the complexity table, add natural-log versions of the size and
# geography measures, drop the raw versions plus the other listed columns,
# and exclude English.
clp <- left_join(c_l, demo, by = "eln2") %>%
  mutate(
    log.max_lang_population = log(max_lang_population),
    log.area                = log(area),
    log.perimeter           = log(perimeter),
    log.numNeighbors        = log(numNeighbors),
    log.numStations         = log(numStations),
    log.sdTemp              = log(sdTemp)
  ) %>%
  select(
    -max_lang_population, -max_lang_populationFlooredAt50,
    -langCountryPop, -logpop2,
    -area, -perimeter,
    -numNeighbors, -numStations,
    -sdTemp, -drycas2
  ) %>%
  filter(eln2 != "english")  # exclude english


# A note on population variables:
# For our set of 80 languages, max_lang_population == max_lang_populationFlooredAt50 and logpop == logpop2.
# The logpop variables are not the same as log.max_lang_population; it is unclear where they come from (possibly the mean population rather than the sum).

# Remove infs: in every column, overwrite infinite values (such as the -Inf
# produced by log(0)) with NA, then rebuild the data frame from the columns.
clp <- do.call(data.frame, lapply(clp, function(column) {
  column[is.infinite(column)] <- NA
  column
}))

Social variables by region