Language codes (from WALS) – WALS, ISO, ascii-name. These are used to merge across datasets.
# Lookup table of language name, WALS code and ISO code.
codes <- read.csv("data/language_codes.csv") %>%
  select(language, WALS, ISO) %>%
  # Where a language lists several comma-separated ISO codes, keep the first.
  mutate(ISO = unlist(lapply(strsplit(as.character(ISO), ","), `[`, 1)))
Read in raw WALS data
# Load the tab-separated raw table ("*" marks missing values), attach ISO
# codes through the WALS code, and drop rows without a usable ISO code.
ld <- read.table(
  "data/lupyan_2010.txt",
  sep = "\t", header = TRUE, fill = TRUE, na.strings = "*"
) %>%
  left_join(codes, c("walsCode" = "WALS")) %>%
  mutate(lang = tolower(lang)) %>%
  # left_join(wichmann_ISOS, by = "lang") %>% # use wichmann ISOS for some missing
  filter(!is.na(ISO), ISO != "") %>%
  # rename columns
  rename(pop_log = logpop2)
# Get means across language: one row per ISO code holding the mean of pop_log
# (missing values ignored).
# Only pop_log and ISO are kept, so that single mean is computed directly
# rather than averaging a large block of columns chosen by hard-coded position
# and then discarding almost all of them; positional indices would silently
# pick the wrong columns if the input layout ever changed.
ld.demo <- ld %>%
  group_by(ISO) %>%
  summarise(pop_log = mean(pop_log, na.rm = TRUE)) %>%
  ungroup() %>%
  select(pop_log, ISO)
Morphological complexity, hand derived
# Get variables to include.
# Take the positionally selected feature columns (18:103) plus column 124
# (presumably ISO, since the table is grouped by it next -- confirm against
# the input layout), treat every column as a factor, and reduce each feature
# to a single level per ISO code with most.frequent.level.
qual_by_row <- ld %>%
  select(18:103, 124) %>%
  mutate_each(funs(as.factor))
qual <- qual_by_row %>%
  group_by(ISO) %>%
  summarise_each(funs(most.frequent.level)) %>%
  mutate_each(funs(as.factor))

# Keep ISO plus only the features that appear in the feature-mapping table.
ld_feature_names <- read.csv("data/lupyan_2010_all_feature_mappings.csv")
qualVarNames <- intersect(names(qual), ld_feature_names$WALS.feature.name)
qual <- select(qual, which(names(qual) %in% c("ISO", qualVarNames)))
# Remap factor levels to complexity values (0, 1), one feature column at a time.
# Note: two variables reported in the paper (NICPOC [59] and CYSIND [39]) are
# missing in the data.
for (i in seq_along(qualVarNames)) {
  # Rows of the mapping table that describe this feature.
  thisVarLabs <- ld_feature_names[ld_feature_names$WALS.feature.name == qualVarNames[i], ]
  old <- thisVarLabs$ld.level.label
  new <- thisVarLabs$ld.complexity
  # `old` can hold several labels, so reduce the NA check to a single
  # TRUE/FALSE; a vector condition inside if() is an error in recent R.
  if (anyNA(old)) {
    print("NA!!!")
  }
  # Locate the column by exact name: a regex search could also match other
  # columns whose names merely contain this feature name.
  col_i <- match(qualVarNames[i], colnames(qual))
  qual[[col_i]] <- plyr::mapvalues(as.character(qual[[col_i]]),
    from = as.character(old),
    to = as.character(new), warn_missing = TRUE)
}
# Hand-derived morphological complexity: for each ISO code, the sum of the
# remapped complexity values over all feature columns. sum() is called without
# na.rm, so an ISO code with any missing feature value gets NA.
ld.complexity <- qual %>%
  mutate_each(funs(as.numeric), -ISO) %>%
  # Gather every column except ISO instead of a hard-coded 2:28 range, so the
  # total stays correct if the number of mapped features changes.
  gather(variable, complexity.level, -ISO) %>%
  group_by(ISO) %>%
  summarise(morphological.complexity.by.hand = sum(complexity.level))

# Attach the hand-derived score to the per-ISO demographics; the key is named
# explicitly rather than inferred from the shared column.
ld.demo.qual <- left_join(ld.demo, ld.complexity, by = "ISO")
Morphological complexity from Lupyan and Dale (2010)
# Load the complexity scores file, attach ISO codes by lower-cased language
# name, and derive an adjusted score: complexity divided by propPresent.
ld.c <- read.table(
  "data/lupyan_2010_complexity.txt",
  sep = "\t", header = TRUE, fill = TRUE, na.strings = "*"
) %>%
  mutate(lang = tolower(lang)) %>%
  left_join(codes, c("lang" = "language")) %>%
  rename(
    morphological.complexity.ld = complexity,
    morphological.complexity.ld.centered = complexityCentered
  ) %>%
  select(morphological.complexity.ld, propPresent, ISO) %>%
  mutate(adjusted.complexity.ld = morphological.complexity.ld / propPresent)

# Add the adjusted score to the per-ISO table.
ld.demo.qual <- left_join(
  ld.demo.qual,
  ld.c %>% select(adjusted.complexity.ld, ISO)
)
Plot
# Scatter the hand-derived score against the adjusted score from the
# complexity file, with a fitted linear trend.
ggplot(
  data = ld.demo.qual,
  mapping = aes(x = morphological.complexity.by.hand, y = adjusted.complexity.ld)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw()

# Correlation test between the two complexity measures.
cor.test(
  x = ld.demo.qual$morphological.complexity.by.hand,
  y = ld.demo.qual$adjusted.complexity.ld
)
##
## Pearson's product-moment correlation
##
## data: ld.demo.qual$morphological.complexity.by.hand and ld.demo.qual$adjusted.complexity.ld
## t = 3.191, df = 861, p-value = 0.001469
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.04167877 0.17359535
## sample estimates:
## cor
## 0.1081129
# Fig. 3 reproduction using the hand-derived score: for each complexity level,
# plot the mean / ci_lower / ci_upper of pop_log returned by multi_boot, with
# a linear trend through the means.
ld.demo.qual %>%
  mutate(morphological.complexity.factor =
    as.factor(morphological.complexity.by.hand)) %>%
  group_by(morphological.complexity.factor) %>%
  multi_boot(column = "pop_log",
    summary_groups = c("morphological.complexity.factor"),
    statistics_functions = c("mean", "ci_lower", "ci_upper")) %>%
  # Recover the numeric score from the grouping factor for plotting.
  mutate(morphological.complexity =
    as.numeric(as.character(morphological.complexity.factor))) %>%
  ggplot() +
  # Both layers use the numeric score on x. Drawing the points on the factor
  # and the trend on the numeric value put them on different scales, which
  # previously had to be patched with a hard-coded "- 5" shift.
  geom_pointrange(aes(x = morphological.complexity,
    y = mean, ymin = ci_lower, ymax = ci_upper)) +
  geom_smooth(method = "lm", aes(morphological.complexity, mean)) +
  ylab("Log population") +
  xlab("Morphological complexity - by hand") +
  ggtitle("Lupyan and Dale (2010) Fig. 3 reproduction - by hand") +
  theme_bw()
# Fig. 3 reproduction using the adjusted score from the complexity file: the
# score is floored into integer bins, and for each bin the mean / ci_lower /
# ci_upper of pop_log returned by multi_boot is plotted with a linear trend.
ld.demo.qual %>%
  mutate(adjusted.complexity.ld = floor(adjusted.complexity.ld)) %>%
  # Drop rows with no adjusted score. The condition must be passed unnamed:
  # a named argument to filter() is rejected by current dplyr.
  filter(!is.na(adjusted.complexity.ld)) %>%
  group_by(adjusted.complexity.ld) %>%
  multi_boot(column = "pop_log",
    summary_groups = c("adjusted.complexity.ld"),
    statistics_functions = c("mean", "ci_lower", "ci_upper")) %>%
  # Make sure the grouping column is numeric again before plotting.
  mutate(adjusted.complexity.ld =
    as.numeric(as.character(adjusted.complexity.ld))) %>%
  ggplot() +
  geom_pointrange(aes(x = adjusted.complexity.ld,
    y = mean, ymin = ci_lower, ymax = ci_upper)) +
  geom_smooth(method = "lm", aes(adjusted.complexity.ld, mean)) +
  ylab("Log population") +
  xlab("Morphological complexity - original data") +
  ggtitle("Lupyan and Dale (2010) Fig. 3 reproduction - original data") +
  theme_bw()