This is a second sample of Swadesh norms in which we altered the instructions to omit examples and removed the initial anchor items. The goal is to create more variance in the norms, relative to RC43-Swadesh.
Load data
library(jsonlite); library(dplyr); library(tidyr)
library(ggplot2); library(broom); library(lme4)
library(langcog)  # for multi_boot_standard

# read each participant's JSON results file and bind into one data frame
files = dir("../production-results/")
d = data.frame()
for (i in seq_along(files)) {
  s <- as.data.frame(fromJSON(paste0("../production-results/", files[i])))
  d = rbind(d, s)
}
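For reference, the same loader written more idiomatically (a sketch; it binds all files at once rather than growing the data frame inside the loop):
# sketch: read every JSON file and row-bind the results
d = dir("../production-results/", full.names = TRUE) %>%
  lapply(function(f) as.data.frame(fromJSON(f))) %>%
  bind_rows()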
# clean up names: strip the "answers." prefix that fromJSON adds to columns
names(d) = gsub("^answers\\.", "", names(d))
Munge.
d_all = d %>%
  gather(variable, value, contains("_")) %>%
  mutate(trial_num = unlist(lapply(strsplit(as.character(variable), "_"),
                                   function(x) x[2])),
         variable = unlist(lapply(strsplit(as.character(variable), "_"),
                                  function(x) x[1]))) %>%
  spread(variable, value) %>%
  mutate(value = as.numeric(value))
d_all$wordLength = nchar(d_all$word)
There are 100 participants in this study.
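The count can be checked against the raw data; assuming each row carries a participant identifier in a column such as workerid (a hypothetical name for this dataset):
# hedged check: number of unique participants (column name is an assumption)
n_distinct(d$workerid)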
d_all %>%
ggplot(aes(x=value)) +
geom_density(fill = "red") +
geom_vline(aes(xintercept = mean(value))) +
xlim(1,7) +
theme_bw()
sw.means = d_all %>%
mutate(word = trimws(word)) %>%
group_by(word) %>%
multi_boot_standard(column = "value") %>%
mutate(wordLength.eng = nchar(word))
ggplot(sw.means, aes(y = wordLength.eng, x = mean )) +
geom_label(aes(label = word),position = "jitter") +
geom_smooth(method = "lm") +
xlim(1,7) +
ylab("word length (char)") +
xlab("Mean complexity rating") +
theme_bw(base_size = 18)
ggplot(sw.means, aes(x = wordLength.eng, y = mean )) +
geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), position = "jitter") +
ylim(1,7) +
xlab("word length (char)") +
ylab("Mean complexity rating") +
theme_bw(base_size = 18) +
coord_flip()
Word length is correlated with complexity in English.
tidy(cor.test(sw.means$wordLength.eng, sw.means$mean))
## estimate statistic p.value parameter conf.low conf.high
## 1 0.5122684 3.676922 0.0007269332 38 0.2388784 0.7104126
Merge with all translations
# note: translations use the ASJP coding system.
# [308 small languages have no ISO code, leaving 4421 (TOTAL: 4729)]
# load in Swadesh translations, and gather
swadesh.raw = read.csv("wichmann_2013.csv") %>%
  select(1:109) %>%   # select(1, 1:109) was redundant; column 1 is in 1:109
  gather(word, translation, I:name)
# translation length: strip spaces, split multi-variant translations on
# commas, and average the character counts across variants
swadesh.raw = swadesh.raw %>%
  mutate(nchar = unlist(lapply(
    lapply(strsplit(gsub("[[:space:]]", "", translation), ","), nchar),
    mean))) %>%
  filter(translation != "")
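As a worked example of what this computes, take a hypothetical two-variant entry:
# hypothetical entry: nchar("kato") = 4, nchar("hundo") = 5, mean = 4.5
mean(nchar(strsplit(gsub("[[:space:]]", "", "kato, hundo"), ",")[[1]]))
## [1] 4.5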
# join with complexity norms
swadesh.crit = swadesh.raw %>%
  filter(is.element(word, sw.means$word)) %>%
  left_join(sw.means) %>%
  rename(complexity = mean) %>%
  filter(ISO != "") %>%
  distinct(ISO, word, .keep_all = TRUE)  # keep the other columns for the summaries below
# add number of items in each language (max == 40)
swadesh.means = swadesh.crit %>%
  left_join(swadesh.crit %>% group_by(ISO) %>% summarise(n = n()))
total.languages = swadesh.means %>%
distinct(ISO) %>%
summarise(count = n())
complete.languages = swadesh.means %>%
filter(n == 40) %>%
distinct(ISO) %>%
summarise(count = n())
There are 4421 languages in this dataset. 980 of those languages have translations for all 40 words.
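The per-language item counts motivate the cutoff chosen next; their distribution can be inspected directly (a sketch reusing swadesh.means):
# how many languages have each item count
swadesh.means %>% distinct(ISO, n) %>% count(n)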
Determine the item-count cutoff
MIN_ITEMS = 30
subset.languages = swadesh.means %>%
  filter(n >= MIN_ITEMS) %>%
  distinct(ISO) %>%
  summarise(count = n())
The cutoff is 30 items. This includes 3877 languages in the analysis.
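To see how sensitive the language sample is to this choice (a sketch; reuses swadesh.means from above):
# number of languages surviving at a few candidate cutoffs
sapply(c(20, 30, 40), function(k)
  swadesh.means %>% filter(n >= k) %>% distinct(ISO) %>% nrow())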
Get complexity bias in each language.
empirical.corrs = swadesh.means %>%
  filter(n >= MIN_ITEMS) %>%
  group_by(ISO) %>%
  summarise(r.empirical = tidy(cor.test(nchar, complexity))$estimate,
            p = tidy(cor.test(nchar, complexity))$p.value,
            lci.swa = tidy(cor.test(nchar, complexity))$conf.low,
            hci.swa = tidy(cor.test(nchar, complexity))$conf.high,
            sig = ifelse(p < .05, "*", ""),
            language = tolower(names[1]),
            fam = tolower(wls_fam[1]),
            lat = lat[1],
            lon = lon[1],
            pop = pop[1])
#filter(sig == "*") %>%
#left_join(swadesh.crit %>% distinct(ISO) %>% select(ISO, wls_fam)) %>%
#mutate(ISO = reorder(ISO,-r))
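Calling cor.test four times per language works but is wasteful. An equivalent one-test-per-group version using the older dplyr do() idiom (a sketch with the same columns; the language metadata would still need joining back):
# sketch: run cor.test once per language and keep the tidied statistics
empirical.corrs.alt = swadesh.means %>%
  filter(n >= MIN_ITEMS) %>%
  group_by(ISO) %>%
  do(tidy(cor.test(.$nchar, .$complexity))) %>%
  rename(r.empirical = estimate, lci.swa = conf.low, hci.swa = conf.high)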
If anything, it looks like the bias gets bigger with larger populations. This differs from the relationship we found in the CogSci 2016 paper.
all.corrs <- empirical.corrs
all.corrs.pop = all.corrs %>%
filter(pop > 0) %>%
mutate(log.pop = log(pop)) %>%
filter(log.pop >5)
ggplot(all.corrs.pop, aes(x = log.pop, y = r.empirical)) +
#geom_histogram() +
geom_point() +
geom_smooth(method = "lm") +
theme_bw()
# note: cor.test has no na.rm argument; it drops incomplete pairs itself
cor.test(all.corrs.pop$log.pop, all.corrs.pop$r.empirical)
##
## Pearson's product-moment correlation
##
## data: all.corrs.pop$log.pop and all.corrs.pop$r.empirical
## t = 9.3134, df = 3308, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1264671 0.1928666
## sample estimates:
## cor
## 0.1598477
summary(lmer(r.empirical ~ log.pop + (1|fam), all.corrs.pop))
## Linear mixed model fit by REML ['lmerMod']
## Formula: r.empirical ~ log.pop + (1 | fam)
## Data: all.corrs.pop
##
## REML criterion at convergence: -2611.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.6634 -0.6777 0.0040 0.6792 3.4021
##
## Random effects:
## Groups Name Variance Std.Dev.
## fam (Intercept) 0.007919 0.08899
## Residual 0.025241 0.15887
## Number of obs: 3310, groups: fam, 163
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.031135 0.014239 2.187
## log.pop 0.002982 0.001212 2.459
##
## Correlation of Fixed Effects:
## (Intr)
## log.pop -0.715
Compare the Swadesh complexity-bias estimates with those from the Google Translate and AoA samples.
google.cb = read.csv('/Documents/GRADUATE_SCHOOL/Projects/langLearnVar/data/lewis_2015.csv') %>%
  rename(corr.g = corr,
         lci.g = lower.ci,
         hci.g = upper.ci)
aoa.cb = read.csv('/Documents/GRADUATE_SCHOOL/Projects/ref_complex/Experiment_43/analysis/RC43_CDI/data/aoa_corrs.csv')
swadesh.cb = empirical.corrs %>%
rename(corr.swa = r.empirical)
all.cb = left_join(google.cb, swadesh.cb, by = "language") %>%
left_join(aoa.cb, by = "language") %>%
select(corr.g, hci.g, lci.g, corr.aoa,
hci.aoa, lci.aoa, corr.swa, hci.swa, lci.swa, language)
ggplot(all.cb,aes(x = corr.g,y = corr.swa)) +
geom_point() +
geom_smooth(method = "lm") +
#geom_errorbar(aes(ymin = lci.swa,ymax = hci.swa)) +
#geom_errorbarh(aes(xmin = lci.g,xmax = hci.g)) +
#geom_label(aes(label = language), size = 3) +
ylim(-.2, .6) +
xlim(-.2, .6) +
theme_bw(base_size = 15) +
ggtitle("Cross-sample correlations of CB") +
xlab("Google Translate CB Correlation") +
ylab("Swadesh CB Correlation")
cor.test(all.cb$corr.g, all.cb$corr.swa)  # incomplete pairs are dropped automatically
##
## Pearson's product-moment correlation
##
## data: all.cb$corr.g and all.cb$corr.swa
## t = 2.8563, df = 50, p-value = 0.006228
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1132077 0.5873985
## sample estimates:
## cor
## 0.3745369