Complexity bias and social features analysis

Social data from Lupyan and Dale (2010)

Read in Lupyan and Dale (2010) and Complexity Data

### L&D data
d = read.table("data/LD_plos_data.txt", fill = T, header = T, sep = "\t", na.strings = "*")

# fix language labels
d$ethnologue_lang_name = tolower(d$ethnologue_lang_name)
d$ethnologue_lang_name[d$ethnologue_lang_name == "standard german"] <- "german"
d$ethnologue_lang_name[d$ethnologue_lang_name == "tosk albanian"] <- "albanian"
d$ethnologue_lang_name[d$ethnologue_lang_name == "catalan-valencian-balear"] <- "catalan"
d$ethnologue_lang_name[d$ethnologue_lang_name == "haitian creole french"] <- "haitian.creole"
d$ethnologue_lang_name[d$ethnologue_lang_name == "irish gaelic"] <- "irish"
d$ethnologue_lang_name[d$ethnologue_lang_name == "central khmer"] <- "khmer"

# Need to collapse across different dialects of same language (e.g. "eastern mongolian" and "peripherial mongolian") [arabic, azerbaijani, chinese, hmong, mongolian, yiddish]

# but first, remove sign languages (e.g."chinese sign language")
d = d[-which(grepl("sign language", d$ethnologue_lang_name)),]

d$eln2 = ifelse(grepl("arabic", d$ethnologue_lang_name), "arabic",
                   ifelse(grepl("azerbaijani", d$ethnologue_lang_name), "azerbaijani",
                          ifelse(grepl("chinese", d$ethnologue_lang_name), "chinese",
                                 ifelse(grepl("hmong", d$ethnologue_lang_name), "hmong",
                                        ifelse(grepl("mongolian", d$ethnologue_lang_name), "mongolian",
                                               ifelse(grepl("yiddish", d$ethnologue_lang_name), "yiddish",
                                                      d$ethnologue_lang_name))))))
d$eln2= as.factor(d$eln2)

# 12 langs not in pops [belarusian, bosnian, cebuana, croatian, esperanto, filipino, kanada, latin, norwegian, persian, punjabi, serbian]

### Complexity data
c_l = read.csv("data/xling_cors.csv")
names(c_l)[which(names(c_l) == "language")] = "eln2"
names(c_l)[which(names(c_l) == "corr")] = "complexity.bias"
names(c_l)[which(names(c_l) == "p.corr")] = "p.complexity.bias"
names(c_l)[which(names(c_l) == "mono.cor")] = "mono.complexity.bias"
names(c_l)[which(names(c_l) == "open.cor")] = "open.complexity.bias"

Merge LD demographic/geographic variables (quantitative) with complexity data

# aggregate across countries to get quantitative measures by language
d_copy = d
demo = d %>%
     group_by(eln2) %>%
     summarise_each(funs(mean(., na.rm = TRUE)), c(8:9, 16:121))

demo = d_copy %>%
      group_by(eln2) %>%
      filter(row_number() == 1) %>%
      select(eln2, langFamily, langGenus)  %>%
      left_join(demo, by = "eln2") # add in language family

# merge with complexity data
clp = c_l %>%
      left_join(demo, by = "eln2") %>%
      mutate(log.max_lang_population = log(max_lang_population),
             log.area = log(area),
             log.perimeter = log(perimeter),
             log.numNeighbors = log(numNeighbors),
             log.numStations = log(numStations),
             log.sdTemp = log(sdTemp)) %>%
      select(-max_lang_population, -area, -langCountryPop,
             -perimeter, -numNeighbors, -numStations,
             -sdTemp, -drycas2, -logpop2, -max_lang_populationFlooredAt50) 

# A note on population variables:
# for our set of 80 langaues, max_lang_population == max_lang_populationFlooredAt50 and logpop == logpop2. 
# the logpop variables are not the same as log.max_lang_population, not sure where these variables come from, maybe mean population? (rather than sum)

# remove infs
clp = do.call(data.frame,lapply(clp, function(x) 
            replace(x, is.infinite(x),NA)))

Plot complexity bias against all demographic/geographic variables

con.plot <- function(df, x) {
  
  # plot setup
  df$x <- df[, x]
  ypos = min(df$x, na.rm = T) + 
    (max(df$x, na.rm = T)-min(df$x, na.rm = T))*.75
  
  # get corr significance
  sig = NA
  sig[1] = cor.test(df$complexity.bias, 
                 df$x, use = "complete")$p.value < .05
  sig[2] = cor.test(df$p.complexity.bias, 
                 df$x, use = "complete")$p.value < .05
  sig[3] = cor.test(df$mono.complexity.bias, 
                 df$x, use = "complete")$p.value < .05
  sig[4] = cor.test(df$open.complexity.bias, 
                 df$x, use = "complete")$p.value < .05
  sig_char = ""
  if (sig[1]) {
    sig_char = paste(sig_char, "*", sep = "")
  } 
  if (sig[2]) {
    sig_char = paste(sig_char, "f", sep = "")
  } 
  if (sig[3]) {
    sig_char = paste(sig_char, "m", sep = "")
  } 
  if (sig[4]) {
    sig_char = paste(sig_char, "o", sep = "")
  } 
  
  # plot
  ggplot(df, aes(y = complexity.bias, x = x)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  ylab("CB") +
  xlab(x) +
  annotate("text", x = ypos, y = .6, color = "red", size = 4,
              label = paste("r=",
                          round(cor(df$complexity.bias, 
                                         df$x, 
                                         use = "complete"), 2),
                          sig_char,
                          sep = "")) +
  themeML
}

clp.plot = clp[,c(-1:-4, -6, -10:-12, -15:-101)]
demoVarNames = names(clp.plot)[-1:-4]  # remove complexity.bias corrs

# make plots
plots=list(NA)
for (i in 1:length(demoVarNames)){
  plots[[i]]  = con.plot(clp.plot, demoVarNames[i])
}

#quartz()
multiplot(plotlist = plots, cols = 3)

Merge morphological L&D variables (qualitative) with complexity data

# aggregate across countries to get quantitative measures by language
most.frequent.level = function (x){
    mf = names(which.max(table(x)))
    return(mf)
}

# make morphological features factors
d[,18:104] <- colwise(as.factor)(d[,18:104])

# aggregate L&D data by language
qual = d %>%
  select(-1:-3, -5:-9, -13, -15:-17, -104:-122) %>%
  group_by(eln2) %>%
  summarise_each(funs(most.frequent.level))

# merge in complexity data
qualp = c_l %>%
       left_join(qual, by="eln2") %>%
       select(-1:-4, -6:-9)

# make everything factors
qualp[,2:92] <- colwise(as.factor)(qualp[,2:92])
qualp_copy = qualp

Plot complexity bias against all morphological features

#Plotting function
qual.plot <- function(df, lingFactor, labs) {
    df$lingFactor <- df[,lingFactor]
    ms = df %>%
         filter(!is.na(lingFactor)) %>%
         multi_boot(column = "complexity.bias",
                      summary_groups = lingFactor,
                      #statistics_functions = c("mean"),
                      statistics_functions = c("mean", "ci_lower","ci_upper"))
    
    # make factor levels in ascending order
    ms[[lingFactor]] = as.factor(ms[[lingFactor]])
    ord = sort(ms$mean, index.return = T)$ix
    ms[[lingFactor]] = factor(ms[[lingFactor]], levels(ms[[lingFactor]])[ord])
    
    # get title
    title = labs[labs$Name == lingFactor, "Description"]

    # plot
    ggplot(ms, aes_string(y = "mean", x = lingFactor, fill = lingFactor)) +
    geom_bar(position = "dodge", stat = "identity") +  
    geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), 
                    width = 0.2, position = "dodge") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none") +
    ylab("CB") +
    ggtitle(title) +
    themeML 
}

VARIABLES WE HAVE HYPOTHESES ABOUT

# get rid of vars we don't care about to make more manageable
dontCare = c("complexity.bias", "checked", "continent", 
             "nativeCountry", "nativeCountryArea", "langGenus")
qualVarNames = setdiff(names(qualp), c(dontCare))

# include 
labNames= read.csv("data/feature_names.csv")
toInclude = labNames[labNames$include == 1, ]
qualVarNames = intersect(qualVarNames, toInclude$Name)

# Some vars are missing from L&D data set ["HASIND" "GILDIF" "AUWPRH" "DRYSBV" "DRYOBV" "DRYXOV" "DRYREL" "POLANT" "POLAPP" "SONPER" "SONNON" "NICMTP" "NICNMP" "DAHTEA"]

# remap factor levels to human readable values (not all present)
labMappings = read.csv("data/labMappings.csv")
for (i in 1:length(qualVarNames)){
  thisVarLabs = labMappings[labMappings$featureName == qualVarNames[i],]
  old = thisVarLabs$oldLab
  new = thisVarLabs$newLab
  col_i = grep(qualVarNames[i], colnames(qualp))
  qualp[,col_i] = mapvalues(qualp[,col_i], from = as.character(old), to = as.character(new))
}

featureCats = levels(droplevels(toInclude$Feature.Class))

Morphology

qualVarNames.morph = intersect(toInclude[toInclude$Feature.Class == "Morphology", "Name"], qualVarNames)
qualp.morph = qualp[, c("complexity.bias", qualVarNames.morph)]

plots = list(NA)
for (i in 1:length(qualVarNames.morph )){
  plots[[i]] = qual.plot(qualp.morph, qualVarNames.morph[i], labNames)
}
multiplot(plotlist = plots, cols = 2)

Nominal Categories

qualVarNames.NC = intersect(toInclude[toInclude$Feature.Class == 
                                     "Nominal Categories", "Name"], qualVarNames)
qualp.NC = qualp[, c("complexity.bias", qualVarNames.NC)]

plots = list(NA)
for (i in 1:length(qualVarNames.NC)){
  plots[[i]] = qual.plot(qualp.NC, qualVarNames.NC[i], labNames)
}
multiplot(plotlist = plots, cols = 2)

Nominal Syntax

qualVarNames.NS = intersect(toInclude[toInclude$Feature.Class == "Nominal Syntax", "Name"], qualVarNames)
qualp.NS = qualp[, c("complexity.bias", qualVarNames.NS)]

plots = list(NA)
for (i in 1:length(qualVarNames.NS)){
  plots[[i]] = qual.plot(qualp.NS, qualVarNames.NS[i], labNames)
}
multiplot(plotlist = plots, cols = 2)

Semantic Lexicon

qualVarNames.SL = intersect(toInclude[toInclude$Feature.Class == "Semantic Lexicon", "Name"], qualVarNames)
qualp.SL = qualp[, c("complexity.bias", qualVarNames.SL)]

plots = list(NA)
for (i in 1:length(qualVarNames.SL)){
  plots[[i]] = qual.plot(qualp.SL, qualVarNames.SL[i], labNames)
}
multiplot(plotlist = plots, cols = 2)

Simple Clauses

qualVarNames.SC = intersect(toInclude[toInclude$Feature.Class == "Simple Clauses", "Name"], qualVarNames)
qualp.SC = qualp[, c("complexity.bias", qualVarNames.SC)]

plots = list(NA)
for (i in 1:length(qualVarNames.SC)){
  plots[[i]] = qual.plot(qualp.SC, qualVarNames.SC[i], labNames)
}
multiplot(plotlist = plots, cols = 2)

Verbal Categories

qualVarNames.VC = intersect(toInclude[toInclude$Feature.Class == "Verbal Categories", "Name"], qualVarNames)
qualp.VC = qualp[, c("complexity.bias", qualVarNames.VC)]

plots = list(NA)
for (i in 1:length(qualVarNames.VC)){
  plots[[i]] = qual.plot(qualp.VC, qualVarNames.VC[i], labNames)
}
multiplot(plotlist = plots, cols = 2)

Word order

qualVarNames.WO = intersect(toInclude[toInclude$Feature.Class == "Word Order", "Name"], qualVarNames)
qualp.WO = qualp[, c("complexity.bias", qualVarNames.WO)]

plots = list(NA)
for (i in 1:length(qualVarNames.WO)){
  plots[[i]] = qual.plot(qualp.WO, qualVarNames.WO[i], labNames)
}
multiplot(plotlist = plots, cols = 2)

VARIABLES WE HAVE HYPOTHESES ABOUT - SIMPLE CATEGORIES

qualp = qualp_copy
# remap factor levels to human readable values (not all present)
for (i in 1:length(qualVarNames)){
  thisVarLabs = labMappings[labMappings$featureName == qualVarNames[i],]
  old = thisVarLabs$oldLab
  new = thisVarLabs$newLab2
  col_i = grep(qualVarNames[i], colnames(qualp))
  qualp[,col_i] = mapvalues(qualp[,col_i], from = as.character(old), to = as.character(new))
}

featureCats = levels(droplevels(toInclude$Feature.Class))