Social data from Lupyan and Dale (2010)
Read in Lupyan and Dale (2010) and Complexity Data
### L&D data
# Load the Lupyan & Dale (2010) table: tab-separated with a header row.
# "*" marks missing values and fill = TRUE pads rows that are short of fields.
d <- read.table("data/LD_plos_data.txt", fill = TRUE, header = TRUE,
                sep = "\t", na.strings = "*")

# Fix language labels: lower-case every name, then replace selected
# Ethnologue names with shorter labels.
d$ethnologue_lang_name <- tolower(d$ethnologue_lang_name)
d$ethnologue_lang_name[d$ethnologue_lang_name == "standard german"] <- "german"
d$ethnologue_lang_name[d$ethnologue_lang_name == "tosk albanian"] <- "albanian"
d$ethnologue_lang_name[d$ethnologue_lang_name == "catalan-valencian-balear"] <- "catalan"
d$ethnologue_lang_name[d$ethnologue_lang_name == "haitian creole french"] <- "haitian.creole"
d$ethnologue_lang_name[d$ethnologue_lang_name == "irish gaelic"] <- "irish"
d$ethnologue_lang_name[d$ethnologue_lang_name == "central khmer"] <- "khmer"

# Remove sign languages (e.g. "chinese sign language") before collapsing
# dialects, so they are not folded into the spoken language of the same name.
# A logical mask is used instead of -which(...): if no row matched, -which()
# would yield integer(0) and the subset would drop every row.
d <- d[!grepl("sign language", d$ethnologue_lang_name), ]

# Collapse different dialects of the same language (e.g. "eastern mongolian"
# and "peripheral mongolian") into a single label in eln2. Groups are applied
# in reverse so that, if a name matched several patterns, the group listed
# first wins -- the same priority as a nested ifelse() chain in this order.
dialect_groups <- c("arabic", "azerbaijani", "chinese",
                    "hmong", "mongolian", "yiddish")
d$eln2 <- d$ethnologue_lang_name
for (dialect_group in rev(dialect_groups)) {
  d$eln2[grepl(dialect_group, d$ethnologue_lang_name)] <- dialect_group
}
d$eln2 <- as.factor(d$eln2)
# 12 langs not in pops [belarusian, bosnian, cebuana, croatian, esperanto, filipino, kanada, latin, norwegian, persian, punjabi, serbian]
### Complexity data
# Load the cross-linguistic correlation table and rename its columns to the
# names used in the rest of the analysis. Columns that are not present in the
# file are simply left alone.
c_l <- read.csv("data/xling_cors.csv")
column_renames <- c(
  language = "eln2",
  corr     = "complexity.bias",
  p.corr   = "p.complexity.bias",
  mono.cor = "mono.complexity.bias",
  open.cor = "open.complexity.bias"
)
to_rename <- names(c_l) %in% names(column_renames)
names(c_l)[to_rename] <- unname(column_renames[names(c_l)[to_rename]])
Merge LD demographic/geographic variables (quantitative) with complexity data
# Collapse the country-level rows of d to one row per language (eln2) by
# averaging the selected columns (chosen by position), ignoring missing values.
d_copy <- d
demo <- d %>%
  group_by(eln2) %>%
  summarise_each(funs(mean(., na.rm = TRUE)), c(8:9, 16:121))

# Take langFamily and langGenus from the first row of each language and
# attach the per-language means to them.
demo <- d_copy %>%
  group_by(eln2) %>%
  filter(row_number() == 1) %>%
  select(eln2, langFamily, langGenus) %>%
  left_join(demo, by = "eln2")

# Merge with the complexity data, add log-transformed versions of the
# skewed quantitative variables, drop the raw versions, and exclude English.
clp <- c_l %>%
  left_join(demo, by = "eln2") %>%
  mutate(
    log.max_lang_population = log(max_lang_population),
    log.area = log(area),
    log.perimeter = log(perimeter),
    log.numNeighbors = log(numNeighbors),
    log.numStations = log(numStations),
    log.sdTemp = log(sdTemp)
  ) %>%
  select(
    -max_lang_population, -area, -langCountryPop,
    -perimeter, -numNeighbors, -numStations,
    -sdTemp, -drycas2, -logpop2, -max_lang_populationFlooredAt50
  ) %>%
  filter(eln2 != "english")

# A note on population variables:
# for our set of 80 languages, max_lang_population ==
# max_lang_populationFlooredAt50 and logpop == logpop2.
# The logpop variables are not the same as log.max_lang_population; it is not
# clear where they come from (maybe mean population rather than sum?).

# Replace infinite values (e.g. log(0) = -Inf) with NA in every column.
clp <- do.call(
  data.frame,
  lapply(clp, function(column) {
    column[is.infinite(column)] <- NA
    column
  })
)
Social variables by region
There is a negative correlation between complexity bias and population size – languages with bigger populations have smaller complexity biases
There is a reliable relationship between population size and complexity bias for all measures of complexity bias (partialling out frequency, open-class words only, and monomorphemic words only).
# Complexity bias predicted by log population size, with language family
# as a covariate.
summary(lm(formula = complexity.bias ~ log.max_lang_population + langFamily,
           data = clp))
##
## Call:
## lm(formula = complexity.bias ~ log.max_lang_population + langFamily,
## data = clp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.301832 -0.029059 0.001661 0.035190 0.166064
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.545201 0.106152 5.136 4.64e-06 ***
## log.max_lang_population -0.015623 0.006174 -2.530 0.0146 *
## langFamilyAltaic -0.010266 0.065835 -0.156 0.8767
## langFamilyAustro-Asiatic 0.055796 0.075731 0.737 0.4647
## langFamilyAustronesian -0.044907 0.060396 -0.744 0.4606
## langFamilyBasque 0.078590 0.099914 0.787 0.4352
## langFamilyCreoles and Pidgins 0.032201 0.098594 0.327 0.7453
## langFamilyDravidian -0.094893 0.076434 -1.242 0.2202
## langFamilyHmong-Mien 0.007494 0.099492 0.075 0.9403
## langFamilyIndo-European 0.084601 0.043075 1.964 0.0551 .
## langFamilyJapanese -0.146588 0.100006 -1.466 0.1490
## langFamilyKartvelian 0.084861 0.098677 0.860 0.3939
## langFamilyKorean -0.087720 0.099450 -0.882 0.3820
## langFamilyNiger-Congo -0.077103 0.060380 -1.277 0.2075
## langFamilySino-Tibetan 0.109686 0.100768 1.089 0.2816
## langFamilyTai-Kadai 0.056016 0.075301 0.744 0.4604
## langFamilyUralic 0.065957 0.065849 1.002 0.3213
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09 on 50 degrees of freedom
## (12 observations deleted due to missingness)
## Multiple R-squared: 0.4655, Adjusted R-squared: 0.2944
## F-statistic: 2.721 on 16 and 50 DF, p-value: 0.00355
# Same model with p.complexity.bias as the outcome.
summary(lm(formula = p.complexity.bias ~ log.max_lang_population + langFamily,
           data = clp))
##
## Call:
## lm(formula = p.complexity.bias ~ log.max_lang_population + langFamily,
## data = clp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.288621 -0.033350 0.002398 0.040852 0.156857
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.425832 0.107647 3.956 0.000241 ***
## log.max_lang_population -0.015144 0.006261 -2.419 0.019262 *
## langFamilyAltaic -0.007561 0.066762 -0.113 0.910286
## langFamilyAustro-Asiatic 0.057632 0.076797 0.750 0.456508
## langFamilyAustronesian -0.043116 0.061247 -0.704 0.484719
## langFamilyBasque 0.058158 0.101321 0.574 0.568546
## langFamilyCreoles and Pidgins 0.049154 0.099982 0.492 0.625132
## langFamilyDravidian -0.147366 0.077510 -1.901 0.063040 .
## langFamilyHmong-Mien 0.047329 0.100893 0.469 0.641041
## langFamilyIndo-European 0.068405 0.043682 1.566 0.123659
## langFamilyJapanese -0.148082 0.101414 -1.460 0.150499
## langFamilyKartvelian 0.101686 0.100066 1.016 0.314432
## langFamilyKorean -0.099979 0.100851 -0.991 0.326287
## langFamilyNiger-Congo -0.079370 0.061231 -1.296 0.200841
## langFamilySino-Tibetan 0.099405 0.102187 0.973 0.335347
## langFamilyTai-Kadai 0.086896 0.076361 1.138 0.260561
## langFamilyUralic 0.043676 0.066776 0.654 0.516070
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09127 on 50 degrees of freedom
## (12 observations deleted due to missingness)
## Multiple R-squared: 0.4558, Adjusted R-squared: 0.2816
## F-statistic: 2.617 on 16 and 50 DF, p-value: 0.004871
# Same model with mono.complexity.bias as the outcome.
summary(lm(formula = mono.complexity.bias ~ log.max_lang_population + langFamily,
           data = clp))
##
## Call:
## lm(formula = mono.complexity.bias ~ log.max_lang_population +
## langFamily, data = clp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.18599 -0.03479 0.00000 0.03175 0.13948
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.477098 0.071355 6.686 1.86e-08 ***
## log.max_lang_population -0.015181 0.004150 -3.658 0.000612 ***
## langFamilyAltaic -0.060554 0.044254 -1.368 0.177328
## langFamilyAustro-Asiatic 0.080700 0.050906 1.585 0.119211
## langFamilyAustronesian -0.058475 0.040598 -1.440 0.156002
## langFamilyBasque 0.007693 0.067162 0.115 0.909260
## langFamilyCreoles and Pidgins -0.023025 0.066274 -0.347 0.729738
## langFamilyDravidian -0.081039 0.051379 -1.577 0.121037
## langFamilyHmong-Mien -0.004403 0.066879 -0.066 0.947768
## langFamilyIndo-European 0.020712 0.028955 0.715 0.477737
## langFamilyJapanese -0.078451 0.067224 -1.167 0.248744
## langFamilyKartvelian 0.037913 0.066330 0.572 0.570168
## langFamilyKorean -0.063652 0.066850 -0.952 0.345596
## langFamilyNiger-Congo -0.076147 0.040588 -1.876 0.066481 .
## langFamilySino-Tibetan 0.128130 0.067736 1.892 0.064341 .
## langFamilyTai-Kadai 0.011196 0.050617 0.221 0.825847
## langFamilyUralic -0.010121 0.044264 -0.229 0.820074
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0605 on 50 degrees of freedom
## (12 observations deleted due to missingness)
## Multiple R-squared: 0.4867, Adjusted R-squared: 0.3225
## F-statistic: 2.964 on 16 and 50 DF, p-value: 0.001705
# Same model with open.complexity.bias as the outcome.
summary(lm(formula = open.complexity.bias ~ log.max_lang_population + langFamily,
           data = clp))
##
## Call:
## lm(formula = open.complexity.bias ~ log.max_lang_population +
## langFamily, data = clp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.299734 -0.038403 0.006584 0.042488 0.169416
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.520534 0.110255 4.721 1.93e-05 ***
## log.max_lang_population -0.016303 0.006413 -2.542 0.0142 *
## langFamilyAltaic -0.011382 0.068380 -0.166 0.8685
## langFamilyAustro-Asiatic 0.047753 0.078658 0.607 0.5465
## langFamilyAustronesian -0.053074 0.062730 -0.846 0.4015
## langFamilyBasque 0.096455 0.103776 0.929 0.3571
## langFamilyCreoles and Pidgins 0.030370 0.102404 0.297 0.7680
## langFamilyDravidian -0.099453 0.079388 -1.253 0.2161
## langFamilyHmong-Mien 0.013168 0.103338 0.127 0.8991
## langFamilyIndo-European 0.074974 0.044740 1.676 0.1000
## langFamilyJapanese -0.165232 0.103871 -1.591 0.1180
## langFamilyKartvelian 0.104568 0.102491 1.020 0.3125
## langFamilyKorean -0.095907 0.103294 -0.928 0.3576
## langFamilyNiger-Congo -0.090317 0.062714 -1.440 0.1561
## langFamilySino-Tibetan 0.106657 0.104662 1.019 0.3131
## langFamilyTai-Kadai 0.052241 0.078211 0.668 0.5072
## langFamilyUralic 0.064620 0.068394 0.945 0.3493
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09348 on 50 degrees of freedom
## (12 observations deleted due to missingness)
## Multiple R-squared: 0.4578, Adjusted R-squared: 0.2843
## F-statistic: 2.638 on 16 and 50 DF, p-value: 0.004564
The relationship between population and complexity holds when controlling for the degree to which a language is isolating vs. concatenating (are there other things that would be interesting to control for?)
# Turn BICFUS into a factor and recode its levels "1", "2", "4", "6", "7"
# into three labels: concatenating, isolating, or both.
clp$BICFUS <- mapvalues(
  as.factor(clp$BICFUS),
  from = c("1", "2", "4", "6", "7"),
  to = c("concatenating", "isolating", "isolating", "concatenating", "both")
)

# Refit the population model with BICFUS as an additional covariate, leaving
# out rows whose BICFUS level is the string "NaN".
# NOTE(review): rows where BICFUS is NA compare as NA here and come through
# the subset as all-NA rows, which lm() then drops as missing -- confirm this
# is intended before changing the filter, since it affects the reported
# "observations deleted" count.
summary(lm(formula = complexity.bias ~ log.max_lang_population + langFamily + BICFUS,
           data = clp[clp$BICFUS != "NaN",]))
##
## Call:
## lm(formula = complexity.bias ~ log.max_lang_population + langFamily +
## BICFUS, data = clp[clp$BICFUS != "NaN", ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.11073 -0.01346 0.00000 0.02641 0.06623
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.59756 0.25126 2.378 0.0413 *
## log.max_lang_population -0.02065 0.01510 -1.368 0.2046
## langFamilyAltaic 0.01244 0.06198 0.201 0.8454
## langFamilyAustro-Asiatic 0.07231 0.08384 0.862 0.4108
## langFamilyAustronesian 0.01869 0.08400 0.222 0.8289
## langFamilyBasque 0.09306 0.09030 1.031 0.3297
## langFamilyHmong-Mien -0.01312 0.10136 -0.129 0.8998
## langFamilyIndo-European 0.17354 0.05744 3.022 0.0144 *
## langFamilyJapanese -0.10526 0.08396 -1.254 0.2415
## langFamilyKartvelian 0.10920 0.07898 1.382 0.2002
## langFamilyKorean -0.04943 0.08077 -0.612 0.5557
## langFamilyNiger-Congo -0.03582 0.05646 -0.634 0.5416
## langFamilySino-Tibetan 0.33483 0.17169 1.950 0.0829 .
## langFamilyTai-Kadai 0.29896 0.14842 2.014 0.0748 .
## langFamilyUralic 0.09041 0.06233 1.450 0.1809
## BICFUSisolating 0.03741 0.05862 0.638 0.5393
## BICFUSboth -0.18050 0.13117 -1.376 0.2021
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0647 on 9 degrees of freedom
## (12 observations deleted due to missingness)
## Multiple R-squared: 0.8135, Adjusted R-squared: 0.482
## F-statistic: 2.454 on 16 and 9 DF, p-value: 0.08711