Load data
files = dir("../../production-results/")
d = data.frame()
for (i in seq_along(files)) {
  s <- as.data.frame(fromJSON(paste0("../../production-results/", files[i])))
  d = rbind(d, s)
}
# clean up names: drop the "answers." prefix from the column names
names(d) = unlist(strsplit(names(d), "rs."))[unlist(strsplit(names(d), "rs.")) != "answe"]
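As an aside, the same loading step could be written more compactly with purrr (a sketch, assuming every results file has the same structure; namespaces are spelled out in case jsonlite and purrr are not attached):
# compact equivalent of the rbind loop above
d <- list.files("../../production-results/", full.names = TRUE) %>%
  purrr::map_dfr(~ as.data.frame(jsonlite::fromJSON(.x)))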
Munge.
d_all = d %>%
  gather(variable, value, contains("_")) %>%
  mutate(trial_num = unlist(lapply(strsplit(as.character(variable), "_"), function(x) x[2])),
         variable = unlist(lapply(strsplit(as.character(variable), "_"), function(x) x[1]))) %>%
  spread(variable, value) %>%
  mutate(value = as.numeric(value))
d_all$wordLength = nchar(d_all$word)
d_sw = filter(d_all, condition == "swadesh")  # split datasets by study
There are 197 participants in this study.
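(For reference, a count along these lines would reproduce that number, assuming the raw data include a unique participant identifier; the column name workerid below is hypothetical and not shown in the code above.)
# hypothetical check -- "workerid" is an assumed column name
n_distinct(d_sw$workerid)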
d_sw %>%
filter(word == "motherboard"| word == "ball") %>%
group_by(word) %>%
multi_boot_standard(column = "value", na.rm = T) %>%
ggplot(aes(y = mean, x = word)) +
geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
ylab("mean complexity")+
ylim(1,7) +
theme_bw(base_size = 18)
d_sw = d_sw %>%
filter(word != "motherboard") %>%
filter(word != "ball")
d_sw %>%
ggplot(aes(x=value)) +
geom_density(fill = "red") +
geom_vline(aes(xintercept = mean(value))) +
xlim(1,7) +
theme_bw()
sw.means = d_sw %>%
mutate(word = trimws(word)) %>%
group_by(word) %>%
multi_boot_standard(column = "value") %>%
mutate(wordLength.eng = nchar(word))
ggplot(sw.means, aes(y = wordLength.eng, x = mean )) +
geom_label(aes(label = word), position = "jitter") +
geom_smooth(method = "lm") +
xlim(1,7) +
ylab("word length (char)") +
xlab("Mean complexity rating") +
theme_bw(base_size = 18)
ggplot(sw.means, aes(x = wordLength.eng, y = mean )) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), position = "jitter") +
ylim(1,7) +
xlab("word length (char)") +
ylab("Mean complexity rating") +
theme_bw(base_size = 18) +
coord_flip()
Word length is correlated with complexity in English (r = .32, p = .046).
tidy(cor.test(sw.means$wordLength.eng, sw.means$mean))
## estimate statistic p.value parameter conf.low conf.high
## 1 0.3168701 2.059443 0.04635121 38 0.0059478 0.5719258
Merge with all translations
# note: this uses the ASJP coding system.
# [308 small languages have no ISO, leaving 4421 (TOTAL: 4729)]
# load in swadesh translations, and gather
swadesh.raw = read.csv("wichmann_2013.csv") %>%
  select(1:109) %>%
  gather(word, translation, I:name)
# get translation length (mean characters across comma-separated variants, spaces removed)
swadesh.raw = swadesh.raw %>%
  mutate(nchar = unlist(lapply(lapply(strsplit(gsub("[[:space:]]", "", translation), ","),
                                      nchar),
                               mean))) %>%
  filter(translation != "")
# join with complexity norms
swadesh.crit = swadesh.raw %>%
filter(is.element(word, sw.means$word)) %>%
left_join(sw.means) %>%
rename(complexity = mean) %>%
filter(ISO != "") %>%
distinct(ISO, word)
# add number of items in each language (max == 40)
swadesh.means = swadesh.crit %>%
  left_join(swadesh.crit %>% group_by(ISO) %>% summarise(n = n()))
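The same per-language item count could also be added in a single step with dplyr's add_count() (a sketch; it yields the same n column as the self-join above):
# equivalent: add an n column with the number of rows per ISO
swadesh.means = swadesh.crit %>%
  add_count(ISO)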
total.languages = swadesh.means %>%
distinct(ISO) %>%
summarise(count = n())
complete.languages = swadesh.means %>%
filter(n == 40) %>%
distinct(ISO) %>%
summarise(count = n())
There are 4421 languages in this dataset. 980 of those languages have translations for all 40 words.
Determine the cutoff for the number of items
MIN_ITEMS = 30
subset.languages = swadesh.means %>%
filter(n > (MIN_ITEMS-1)) %>%
distinct(ISO) %>%
summarise(count = n())
The cutoff is 30 items. This includes 3877 languages in the analysis.
Get complexity bias in each language.
empirical.corrs = swadesh.means %>%
filter(n > (MIN_ITEMS-1)) %>%
group_by(ISO) %>%
summarise(r.empirical = tidy(cor.test(nchar, complexity))$estimate,
p = tidy(cor.test(nchar, complexity))$p.value,
sig = ifelse(p<.05, "*", ""),
language = tolower(names[1]),
fam = tolower(wls_fam[1]),
lat = lat[1],
lon = lon[1],
pop = pop[1])
#filter(sig == "*") %>%
#left_join(swadesh.crit %>% distinct(ISO) %>% select(ISO, wls_fam)) %>%
#mutate(ISO = reorder(ISO,-r))
Get “max” and “min” complexity bias in each language.
# importantly, these are the limits given the spread of lengths for these words. In principle, the bias could be larger or smaller if the variance were different.
get_corr_limits <- function(lang_data) {
if (nrow(lang_data) > 10) {
sorted.nchar = sort(lang_data$nchar)
sorted.complexity = sort(lang_data$complexity)
r.max = tidy(cor.test(sorted.nchar, sorted.complexity))$estimate
r.min = tidy(cor.test(sorted.nchar, rev(sorted.complexity)))$estimate
} else {
r.max = NA
r.min = NA
}
data.frame(r.max = r.max,
r.min = r.min,
row.names = NULL)
}
limits.corrs <- swadesh.means %>%
filter(n > (MIN_ITEMS-1)) %>%
split(.$ISO) %>%
map(function(lang_data) get_corr_limits(lang_data)) %>%
bind_rows(.id = "ISO") %>%
filter(ISO != "")
Bootstrap baseline for each language.
# for each language: shuffle word lengths against complexity ratings and compute r, nboot times
# take the mean of the resampled rs
# look at the distribution over mean rs
NBOOTS <- 100
one_sample <- function(lang_data) {
  function(k) {
    if (nrow(lang_data) > 10) { # fix this: hard-coded item minimum (cf. MIN_ITEMS)
      shuffled.nchar = sample_n(lang_data, nrow(lang_data), replace = F)$nchar
      tidy(cor.test(shuffled.nchar, lang_data$complexity))$estimate
    } else {
      NA
    }
  }
}
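# note: sample_n() with n = nrow(lang_data) and replace = F simply permutes the rows,
# so sample(lang_data$nchar) would be an equivalent, more direct way to shuffle the lengths.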
all_samples <- function(lang_data, nboot) {
sample_values <- 1:nboot %>%
map(one_sample(lang_data)) %>%
unlist()
data.frame(mean = mean(sample_values, na.rm = T),
ci_lower = ci_lower(sample_values, na.rm = T),
ci_upper = ci_upper(sample_values, na.rm = T),
row.names = NULL)
}
baseline.mean.corrs <- swadesh.means %>%
filter(n > (MIN_ITEMS-1)) %>%
split(.$ISO) %>%
map(function(lang_data) all_samples(lang_data, NBOOTS)) %>%
bind_rows(.id = "ISO") %>%
filter(!is.nan(mean)) %>%
rename(r.baseline = mean)
all.corrs = left_join(empirical.corrs, baseline.mean.corrs) %>%
left_join(limits.corrs)
all.corrs %>%
gather("sample", "r", c(2,10,13,14)) %>%
ggplot(aes(r, group = sample, fill = sample)) +
#geom_histogram() +
geom_density(alpha = .4) +
geom_vline(aes(xintercept = 0)) +
geom_vline(aes(xintercept = -1)) +
geom_vline(aes(xintercept = 1)) +
theme_bw()
# paired t-test
t.test(all.corrs$r.baseline, all.corrs$r.empirical, paired = T)
##
## Paired t-test
##
## data: all.corrs$r.baseline and all.corrs$r.empirical
## t = -34.939, df = 3876, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.09843406 -0.08797393
## sample estimates:
## mean of the differences
## -0.093204
Should we also run a non-parametric test here (e.g., a Wilcoxon signed-rank test)?
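A minimal sketch of that check, using the same paired columns as the t-test above:
# non-parametric paired comparison of baseline vs. empirical correlations
wilcox.test(all.corrs$r.baseline, all.corrs$r.empirical, paired = TRUE)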
all.corrs.fam = all.corrs %>%
group_by(fam) %>%
summarise(r.empirical = mean(r.empirical),
r.baseline = mean(r.baseline),
r.max = mean(r.max),
r.min = mean(r.min),
pop = pop[1])
all.corrs.fam %>%
gather("sample", "r", c(r.empirical, r.baseline, r.max, r.min)) %>%
ggplot(aes(r, group = sample, fill = sample)) +
#geom_histogram() +
geom_density(alpha = .4) +
geom_vline(aes(xintercept = 0)) +
geom_vline(aes(xintercept = -1)) +
geom_vline(aes(xintercept = 1)) +
theme_bw()
# paired t-test
t.test(all.corrs.fam$r.baseline,all.corrs.fam$r.empirical, paired = T)
##
## Paired t-test
##
## data: all.corrs.fam$r.baseline and all.corrs.fam$r.empirical
## t = -5.9015, df = 219, p-value = 1.357e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.08438880 -0.04213518
## sample estimates:
## mean of the differences
## -0.06326199
If anything, it looks like the bias gets bigger with larger populations. This is a different relationship from the one we found in the CogSci 2016 paper.
all.corrs.pop = all.corrs %>%
filter(pop > 0) %>%
mutate(log.pop = log(pop)) %>%
filter(log.pop > 5)
ggplot(all.corrs.pop, aes(x = log.pop, y = r.empirical)) +
#geom_histogram() +
geom_point() +
geom_smooth(method = "lm") +
theme_bw()
cor.test(all.corrs.pop$log.pop, all.corrs.pop$r.empirical) # cor.test() drops incomplete pairs itself (no na.rm argument)
##
## Pearson's product-moment correlation
##
## data: all.corrs.pop$log.pop and all.corrs.pop$r.empirical
## t = 5.625, df = 3308, p-value = 2.009e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.06347734 0.13097107
## sample estimates:
## cor
## 0.09733612
summary(lmer(r.empirical ~ log.pop + (1|fam), all.corrs.pop))
## Linear mixed model fit by REML ['lmerMod']
## Formula: r.empirical ~ log.pop + (1 | fam)
## Data: all.corrs.pop
##
## REML criterion at convergence: -2872.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.4753 -0.6610 0.0300 0.6841 3.1410
##
## Random effects:
## Groups Name Variance Std.Dev.
## fam (Intercept) 0.01005 0.1002
## Residual 0.02314 0.1521
## Number of obs: 3310, groups: fam, 163
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.0818360 0.0143698 5.695
## log.pop -0.0009608 0.0011672 -0.823
##
## Correlation of Fixed Effects:
## (Intr)
## log.pop -0.677
all.corrs.pop = all.corrs.fam %>%
filter(pop > 0) %>%
mutate(log.pop = log(pop)) %>%
filter(log.pop > 5)
ggplot(all.corrs.pop, aes(x = log.pop, y = r.empirical)) +
#geom_histogram() +
geom_point() +
geom_smooth(method = "lm") +
theme_bw()
cor.test(all.corrs.pop$log.pop, all.corrs.pop$r.empirical)
##
## Pearson's product-moment correlation
##
## data: all.corrs.pop$log.pop and all.corrs.pop$r.empirical
## t = 2.3647, df = 146, p-value = 0.01936
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03170071 0.34278385
## sample estimates:
## cor
## 0.1920622
MISC
# ggplot(corrs, aes(x = ISO, y= r, fill = wls_fam)) +
# geom_bar(stat= "identity") +
# theme_bw() +
# theme(axis.text.x = element_text(angle = 90, hjust = 1),
# legend.position = "none")