Complexity bias and social features analysis

Social data from Lupyan and Dale (2010)

Read in Lupyan and Dale (2010) and Complexity Data

### L&D data
# Read the Lupyan & Dale (2010) table: tab-separated with a header row,
# "*" marks missing values, and short rows are padded (fill = TRUE).
d <- read.table("data/LD_plos_data.txt", fill = TRUE, header = TRUE,
                sep = "\t", na.strings = "*")

# fix language labels: lower-case everything, then map a handful of
# Ethnologue names onto simpler labels
d$ethnologue_lang_name <- tolower(d$ethnologue_lang_name)
label_fixes <- c(
  "standard german"          = "german",
  "tosk albanian"            = "albanian",
  "catalan-valencian-balear" = "catalan",
  "haitian creole french"    = "haitian.creole",
  "irish gaelic"             = "irish",
  "central khmer"            = "khmer"
)
needs_fix <- d$ethnologue_lang_name %in% names(label_fixes)
d$ethnologue_lang_name[needs_fix] <-
  unname(label_fixes[d$ethnologue_lang_name[needs_fix]])

# Need to collapse across different dialects of same language (e.g. "eastern
# mongolian" and "peripheral mongolian")
# [arabic, azerbaijani, chinese, hmong, mongolian, yiddish]

# but first, remove sign languages (e.g. "chinese sign language").
# A logical mask is used instead of -which(...): with no matches, -which()
# is a zero-length index and would silently drop every row.
d <- d[!grepl("sign language", d$ethnologue_lang_name), ]

# eln2 is the collapsed language label: any name containing one of the group
# labels becomes that label; every other name is kept unchanged. Groups are
# applied in reverse so that, should a name match more than one group, the
# group listed first wins.
dialect_groups <- c("arabic", "azerbaijani", "chinese",
                    "hmong", "mongolian", "yiddish")
d$eln2 <- d$ethnologue_lang_name
for (dialect_group in rev(dialect_groups)) {
  d$eln2[grepl(dialect_group, d$ethnologue_lang_name)] <- dialect_group
}
d$eln2 <- as.factor(d$eln2)

# 12 langs not in pops [belarusian, bosnian, cebuana, croatian, esperanto, filipino, kanada, latin, norwegian, persian, punjabi, serbian]

### Complexity data
# Load the per-language complexity-bias correlations and rename the columns:
# "language" becomes the join key eln2, and each correlation column gets a
# *.complexity.bias name. Columns absent from the file are simply skipped.
c_l <- read.csv("data/xling_cors.csv")
column_renames <- c(
  language = "eln2",
  corr     = "complexity.bias",
  p.corr   = "p.complexity.bias",
  mono.cor = "mono.complexity.bias",
  open.cor = "open.complexity.bias"
)
for (old_name in names(column_renames)) {
  names(c_l)[names(c_l) == old_name] <- column_renames[[old_name]]
}

Merge LD demographic/geographic variables (quantitative) with complexity data

# Aggregate across countries to get one row per language: the mean (ignoring
# NAs) of the columns at positions 8:9 and 16:121.
# NOTE(review): the positional selection assumes the column layout of
# LD_plos_data.txt is fixed - confirm before changing the input file.
d_copy <- d
demo <- summarise_each(
  group_by(d, eln2),
  funs(mean(., na.rm = TRUE)),
  c(8:9, 16:121)
)

# Language family and genus are taken from the first row of each language and
# joined onto the per-language means.
first_rows <- filter(group_by(d_copy, eln2), row_number() == 1)
lang_taxonomy <- select(first_rows, eln2, langFamily, langGenus)
demo <- left_join(lang_taxonomy, demo, by = "eln2") # add in language family

# Merge with complexity data (one row per language in c_l).
clp <- left_join(c_l, demo, by = "eln2")

# Log-transform the skewed quantitative predictors ...
clp <- mutate(
  clp,
  log.max_lang_population = log(max_lang_population),
  log.area                = log(area),
  log.perimeter           = log(perimeter),
  log.numNeighbors        = log(numNeighbors),
  log.numStations         = log(numStations),
  log.sdTemp              = log(sdTemp)
)

# ... then drop their untransformed versions along with a few other columns.
clp <- select(
  clp,
  -max_lang_population, -area, -langCountryPop,
  -perimeter, -numNeighbors, -numStations,
  -sdTemp, -drycas2, -logpop2, -max_lang_populationFlooredAt50
)

# exclude english
clp <- filter(clp, eln2 != "english")


# A note on population variables:
# for our set of 80 languages, max_lang_population == max_lang_populationFlooredAt50 and logpop == logpop2.
# the logpop variables are not the same as log.max_lang_population; not sure where these variables come from, maybe mean population (rather than sum)?

# Remove infs: set every infinite entry to NA, column by column, then rebuild
# clp as a plain data.frame from the cleaned columns.
inf_to_na <- function(column) {
  column[is.infinite(column)] <- NA
  column
}
clp <- do.call(data.frame, lapply(clp, inf_to_na))

Social variables by region

There is a negative correlation between complexity bias and population size – languages with bigger populations have smaller complexity biases

There is a reliable relationship between population size and complexity bias for all measures of complexity bias (partialling out frequency, open class only, and monomorphemic only).

# Complexity bias regressed on log population, with language family as a
# covariate.
fit_bias <- lm(complexity.bias ~ log.max_lang_population + langFamily,
               data = clp)
summary(fit_bias)

## 
## Call:
## lm(formula = complexity.bias ~ log.max_lang_population + langFamily, 
##     data = clp)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.301832 -0.029059  0.001661  0.035190  0.166064 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    0.545201   0.106152   5.136 4.64e-06 ***
## log.max_lang_population       -0.015623   0.006174  -2.530   0.0146 *  
## langFamilyAltaic              -0.010266   0.065835  -0.156   0.8767    
## langFamilyAustro-Asiatic       0.055796   0.075731   0.737   0.4647    
## langFamilyAustronesian        -0.044907   0.060396  -0.744   0.4606    
## langFamilyBasque               0.078590   0.099914   0.787   0.4352    
## langFamilyCreoles and Pidgins  0.032201   0.098594   0.327   0.7453    
## langFamilyDravidian           -0.094893   0.076434  -1.242   0.2202    
## langFamilyHmong-Mien           0.007494   0.099492   0.075   0.9403    
## langFamilyIndo-European        0.084601   0.043075   1.964   0.0551 .  
## langFamilyJapanese            -0.146588   0.100006  -1.466   0.1490    
## langFamilyKartvelian           0.084861   0.098677   0.860   0.3939    
## langFamilyKorean              -0.087720   0.099450  -0.882   0.3820    
## langFamilyNiger-Congo         -0.077103   0.060380  -1.277   0.2075    
## langFamilySino-Tibetan         0.109686   0.100768   1.089   0.2816    
## langFamilyTai-Kadai            0.056016   0.075301   0.744   0.4604    
## langFamilyUralic               0.065957   0.065849   1.002   0.3213    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09 on 50 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.4655, Adjusted R-squared:  0.2944 
## F-statistic: 2.721 on 16 and 50 DF,  p-value: 0.00355

# Same model for p.complexity.bias (presumably the bias with frequency
# partialled out - confirm against xling_cors.csv).
fit_p_bias <- lm(p.complexity.bias ~ log.max_lang_population + langFamily,
                 data = clp)
summary(fit_p_bias)

## 
## Call:
## lm(formula = p.complexity.bias ~ log.max_lang_population + langFamily, 
##     data = clp)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.288621 -0.033350  0.002398  0.040852  0.156857 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    0.425832   0.107647   3.956 0.000241 ***
## log.max_lang_population       -0.015144   0.006261  -2.419 0.019262 *  
## langFamilyAltaic              -0.007561   0.066762  -0.113 0.910286    
## langFamilyAustro-Asiatic       0.057632   0.076797   0.750 0.456508    
## langFamilyAustronesian        -0.043116   0.061247  -0.704 0.484719    
## langFamilyBasque               0.058158   0.101321   0.574 0.568546    
## langFamilyCreoles and Pidgins  0.049154   0.099982   0.492 0.625132    
## langFamilyDravidian           -0.147366   0.077510  -1.901 0.063040 .  
## langFamilyHmong-Mien           0.047329   0.100893   0.469 0.641041    
## langFamilyIndo-European        0.068405   0.043682   1.566 0.123659    
## langFamilyJapanese            -0.148082   0.101414  -1.460 0.150499    
## langFamilyKartvelian           0.101686   0.100066   1.016 0.314432    
## langFamilyKorean              -0.099979   0.100851  -0.991 0.326287    
## langFamilyNiger-Congo         -0.079370   0.061231  -1.296 0.200841    
## langFamilySino-Tibetan         0.099405   0.102187   0.973 0.335347    
## langFamilyTai-Kadai            0.086896   0.076361   1.138 0.260561    
## langFamilyUralic               0.043676   0.066776   0.654 0.516070    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09127 on 50 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.4558, Adjusted R-squared:  0.2816 
## F-statistic: 2.617 on 16 and 50 DF,  p-value: 0.004871

# Same model for mono.complexity.bias (presumably monomorphemic words only -
# confirm against xling_cors.csv).
fit_mono_bias <- lm(mono.complexity.bias ~ log.max_lang_population + langFamily,
                    data = clp)
summary(fit_mono_bias)

## 
## Call:
## lm(formula = mono.complexity.bias ~ log.max_lang_population + 
##     langFamily, data = clp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.18599 -0.03479  0.00000  0.03175  0.13948 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    0.477098   0.071355   6.686 1.86e-08 ***
## log.max_lang_population       -0.015181   0.004150  -3.658 0.000612 ***
## langFamilyAltaic              -0.060554   0.044254  -1.368 0.177328    
## langFamilyAustro-Asiatic       0.080700   0.050906   1.585 0.119211    
## langFamilyAustronesian        -0.058475   0.040598  -1.440 0.156002    
## langFamilyBasque               0.007693   0.067162   0.115 0.909260    
## langFamilyCreoles and Pidgins -0.023025   0.066274  -0.347 0.729738    
## langFamilyDravidian           -0.081039   0.051379  -1.577 0.121037    
## langFamilyHmong-Mien          -0.004403   0.066879  -0.066 0.947768    
## langFamilyIndo-European        0.020712   0.028955   0.715 0.477737    
## langFamilyJapanese            -0.078451   0.067224  -1.167 0.248744    
## langFamilyKartvelian           0.037913   0.066330   0.572 0.570168    
## langFamilyKorean              -0.063652   0.066850  -0.952 0.345596    
## langFamilyNiger-Congo         -0.076147   0.040588  -1.876 0.066481 .  
## langFamilySino-Tibetan         0.128130   0.067736   1.892 0.064341 .  
## langFamilyTai-Kadai            0.011196   0.050617   0.221 0.825847    
## langFamilyUralic              -0.010121   0.044264  -0.229 0.820074    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0605 on 50 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.4867, Adjusted R-squared:  0.3225 
## F-statistic: 2.964 on 16 and 50 DF,  p-value: 0.001705

# Same model for open.complexity.bias (presumably open-class words only -
# confirm against xling_cors.csv).
fit_open_bias <- lm(open.complexity.bias ~ log.max_lang_population + langFamily,
                    data = clp)
summary(fit_open_bias)

## 
## Call:
## lm(formula = open.complexity.bias ~ log.max_lang_population + 
##     langFamily, data = clp)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.299734 -0.038403  0.006584  0.042488  0.169416 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    0.520534   0.110255   4.721 1.93e-05 ***
## log.max_lang_population       -0.016303   0.006413  -2.542   0.0142 *  
## langFamilyAltaic              -0.011382   0.068380  -0.166   0.8685    
## langFamilyAustro-Asiatic       0.047753   0.078658   0.607   0.5465    
## langFamilyAustronesian        -0.053074   0.062730  -0.846   0.4015    
## langFamilyBasque               0.096455   0.103776   0.929   0.3571    
## langFamilyCreoles and Pidgins  0.030370   0.102404   0.297   0.7680    
## langFamilyDravidian           -0.099453   0.079388  -1.253   0.2161    
## langFamilyHmong-Mien           0.013168   0.103338   0.127   0.8991    
## langFamilyIndo-European        0.074974   0.044740   1.676   0.1000    
## langFamilyJapanese            -0.165232   0.103871  -1.591   0.1180    
## langFamilyKartvelian           0.104568   0.102491   1.020   0.3125    
## langFamilyKorean              -0.095907   0.103294  -0.928   0.3576    
## langFamilyNiger-Congo         -0.090317   0.062714  -1.440   0.1561    
## langFamilySino-Tibetan         0.106657   0.104662   1.019   0.3131    
## langFamilyTai-Kadai            0.052241   0.078211   0.668   0.5072    
## langFamilyUralic               0.064620   0.068394   0.945   0.3493    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09348 on 50 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.4578, Adjusted R-squared:  0.2843 
## F-statistic: 2.638 on 16 and 50 DF,  p-value: 0.004564

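The negative relationship between population and complexity bias keeps its direction when controlling for the degree to which a language is isolating vs. concatenating, although in the smaller set of 26 languages with this information the population coefficient is no longer significant (p = 0.20). (Are there other things that would be interesting to control for?)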

# Recode BICFUS into three categories. Codes 1 and 6 become "concatenating",
# 2 and 4 become "isolating", and 7 becomes "both"; any other level (including
# "NaN") is left as it is.
clp$BICFUS <- as.factor(clp$BICFUS)
clp$BICFUS <- mapvalues(clp$BICFUS,
                        from = c("1", "2", "4", "6", "7"),
                        to = c("concatenating", "isolating",
                               "isolating", "concatenating", "both"))

# Keep only rows with a usable BICFUS value. The explicit !is.na() matters:
# `clp$BICFUS != "NaN"` is NA wherever BICFUS is missing, and indexing a data
# frame with NA inserts all-NA rows, which lm() would then drop as missing
# observations.
has_bicfus <- !is.na(clp$BICFUS) & clp$BICFUS != "NaN"
summary(lm(complexity.bias ~ log.max_lang_population + langFamily + BICFUS,
           data = clp[has_bicfus, ]))

## 
## Call:
## lm(formula = complexity.bias ~ log.max_lang_population + langFamily + 
##     BICFUS, data = clp[clp$BICFUS != "NaN", ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.11073 -0.01346  0.00000  0.02641  0.06623 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)  
## (Intercept)               0.59756    0.25126   2.378   0.0413 *
## log.max_lang_population  -0.02065    0.01510  -1.368   0.2046  
## langFamilyAltaic          0.01244    0.06198   0.201   0.8454  
## langFamilyAustro-Asiatic  0.07231    0.08384   0.862   0.4108  
## langFamilyAustronesian    0.01869    0.08400   0.222   0.8289  
## langFamilyBasque          0.09306    0.09030   1.031   0.3297  
## langFamilyHmong-Mien     -0.01312    0.10136  -0.129   0.8998  
## langFamilyIndo-European   0.17354    0.05744   3.022   0.0144 *
## langFamilyJapanese       -0.10526    0.08396  -1.254   0.2415  
## langFamilyKartvelian      0.10920    0.07898   1.382   0.2002  
## langFamilyKorean         -0.04943    0.08077  -0.612   0.5557  
## langFamilyNiger-Congo    -0.03582    0.05646  -0.634   0.5416  
## langFamilySino-Tibetan    0.33483    0.17169   1.950   0.0829 .
## langFamilyTai-Kadai       0.29896    0.14842   2.014   0.0748 .
## langFamilyUralic          0.09041    0.06233   1.450   0.1809  
## BICFUSisolating           0.03741    0.05862   0.638   0.5393  
## BICFUSboth               -0.18050    0.13117  -1.376   0.2021  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0647 on 9 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.8135, Adjusted R-squared:  0.482 
## F-statistic: 2.454 on 16 and 9 DF,  p-value: 0.08711