This is a second sample of Swadesh norms in which we altered the instructions to omit examples and removed the initial anchor items. The goal is to create more variance in the norms, relative to RC43-Swadesh.

Complexity norms

Load data

library(jsonlite); library(dplyr); library(tidyr); library(ggplot2)
library(broom); library(lme4); library(langcog)

# read each participant's JSON results file and bind the rows together
files = dir("../production-results/")
d = data.frame()
for (i in seq_along(files)) {
    s <- as.data.frame(fromJSON(paste("../production-results/", files[i], sep = "")))
    d = rbind(d, s)
}

# clean up names: strip the "answers." prefix that fromJSON prepends
# (e.g., "answers.word_1" becomes "word_1")
names(d) = gsub("^answers\\.", "", names(d))

Munge.

# reshape from wide (one column per trial) to long (one row per trial):
# split column names like "value_1" into a measure ("value") and a trial
# number ("1"), then spread the measures back out as columns
d_all = d %>%
  gather(variable, value, contains("_")) %>%
  mutate(trial_num = unlist(lapply(strsplit(as.character(variable), "_"),
                                   function(x) x[2])),
         variable = unlist(lapply(strsplit(as.character(variable), "_"),
                                  function(x) x[1]))) %>%
  spread(variable, value) %>%
  mutate(value = as.numeric(value))

d_all$wordLength = nchar(d_all$word)
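
To make the reshape concrete, here is a toy illustration with hypothetical column names (the real columns follow the same measure_trial pattern):

# hypothetical data, not the experiment's actual columns
toy = data.frame(workerid = "A",
                 value_1 = 3, word_1 = "sun",
                 value_2 = 6, word_2 = "water",
                 stringsAsFactors = FALSE)

toy %>%
  gather(variable, value, contains("_")) %>%
  mutate(trial_num = unlist(lapply(strsplit(variable, "_"), function(x) x[2])),
         variable = unlist(lapply(strsplit(variable, "_"), function(x) x[1]))) %>%
  spread(variable, value) %>%
  mutate(value = as.numeric(value))
# yields one row per trial: workerid, trial_num, value, word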

There are 100 participants in this study.
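
This count presumably comes from the number of results files, assuming one JSON file per participant:

# each results file corresponds to one participant
length(files)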

Response distribution

d_all %>%
    ggplot(aes(x=value)) +
    geom_density(fill = "red") +
    geom_vline(aes(xintercept = mean(value))) +
    xlim(1,7) +
    theme_bw()

Complexity bias in English

sw.means = d_all %>%
  mutate(word = trimws(word)) %>%
  group_by(word) %>%
  multi_boot_standard(column = "value")  %>%
  mutate(wordLength.eng = nchar(word))

ggplot(sw.means, aes(y = wordLength.eng, x = mean )) +
  geom_label(aes(label = word),position = "jitter") +
  geom_smooth(method = "lm") +
  xlim(1,7) +
  ylab("word length (char)") +
  xlab("Mean complexity rating") +
  theme_bw(base_size = 18)

ggplot(sw.means, aes(x = wordLength.eng, y = mean )) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), position = "jitter") +
  ylim(1,7) +
  xlab("word length (char)") +
  ylab("Mean complexity rating") +
  theme_bw(base_size = 18) +
  coord_flip() 

Stats.

Word length is correlated with complexity in English.

tidy(cor.test(sw.means$wordLength.eng, sw.means$mean))
##    estimate statistic      p.value parameter  conf.low conf.high
## 1 0.5122684  3.676922 0.0007269332        38 0.2388784 0.7104126

Translations

Merge with all translations

# note: translations use the ASJP coding system
# [308 small languages have no ISO code, leaving 4421 (TOTAL: 4729)]

# load in swadesh translations, and gather
# load the Swadesh translations and gather to long format
swadesh.raw = read.csv("wichmann_2013.csv") %>%
  select(1:109) %>%
  gather(word, translation, I:name)

# get translation length
# translation length: some entries list several comma-separated variants,
# so take the mean character length across variants (spaces removed)
swadesh.raw = swadesh.raw %>%
  mutate(nchar = unlist(
    lapply(
      lapply(
        strsplit(
          gsub("[[:space:]]", "", translation),
          ","),
        nchar), mean))) %>%
  filter(translation != "")
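
To illustrate the variant handling, a single hypothetical ASJP-style entry with two comma-separated forms:

# hypothetical entry, not from the actual dataset
x = "kEna, kEnu"
mean(nchar(strsplit(gsub("[[:space:]]", "", x), ",")[[1]]))  # (4 + 4) / 2 = 4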

# join with complexity norms
swadesh.crit = swadesh.raw %>%
  filter(is.element(word, sw.means$word)) %>%
  left_join(sw.means, by = "word") %>%
  rename(complexity = mean) %>%
  filter(ISO != "") %>%
  distinct(ISO, word, .keep_all = TRUE)  # one row per language-word pair

# add number of items in each language (max == 40)
swadesh.means = swadesh.crit %>%
  left_join(swadesh.crit %>%
              group_by(ISO) %>%
              summarise(n = n()))

total.languages = swadesh.means %>%
  distinct(ISO) %>%
  summarise(count = n())

complete.languages = swadesh.means %>%
  filter(n == 40) %>%
  distinct(ISO) %>%
  summarise(count = n())

There are 4421 languages in this dataset. 980 of those languages have translations for all 40 words.
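
Both figures come straight from the summaries above:

total.languages$count     # 4421
complete.languages$count  # 980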

X-ling complexity bias

Determine the cutoff for the minimum number of items per language.

MIN_ITEMS = 30

subset.languages = swadesh.means %>%
  filter(n >= MIN_ITEMS) %>%
  distinct(ISO) %>%
  summarise(count = n())

The cutoff is 30 items. This includes 3877 languages in the analysis.

Get complexity bias in each language.

# per-language complexity bias: correlation between translation length
# and the English complexity norms
empirical.corrs = swadesh.means %>%
  filter(n >= MIN_ITEMS) %>%
  group_by(ISO) %>%
  summarise(r.empirical = tidy(cor.test(nchar, complexity))$estimate,
            p = tidy(cor.test(nchar, complexity))$p.value,
            lci.swa = tidy(cor.test(nchar, complexity))$conf.low,
            hci.swa = tidy(cor.test(nchar, complexity))$conf.high,
            sig = ifelse(p < .05, "*", ""),
            language = tolower(names[1]),
            fam = tolower(wls_fam[1]),
            lat = lat[1],
            lon = lon[1],
            pop = pop[1])
  #filter(sig == "*") %>%
  #left_join(swadesh.crit %>% distinct(ISO) %>% select(ISO, wls_fam)) %>%
  #mutate(ISO = reorder(ISO,-r)) 
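
As a side note, the summarise above reruns cor.test once per extracted statistic. A more economical sketch (which should give the same estimates) runs the test once per language with do(); the language metadata columns would need to be joined back on afterwards:

empirical.corrs.alt = swadesh.means %>%
  filter(n >= MIN_ITEMS) %>%
  group_by(ISO) %>%
  do(tidy(cor.test(.$nchar, .$complexity)))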

Population correlations

If anything, it looks like the bias gets bigger in languages with larger populations. This is a different relationship from the one we found in the CogSci 2016 paper.

By language

all.corrs <- empirical.corrs
all.corrs.pop = all.corrs %>%
  filter(pop > 0) %>%
  mutate(log.pop = log(pop)) %>%
  filter(log.pop > 5)  # drop languages with fewer than ~150 speakers (e^5)
  
ggplot(all.corrs.pop, aes(x = log.pop, y = r.empirical)) +
  #geom_histogram() +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw()

cor.test(all.corrs.pop$log.pop, all.corrs.pop$r.empirical)
## 
##  Pearson's product-moment correlation
## 
## data:  all.corrs.pop$log.pop and all.corrs.pop$r.empirical
## t = 9.3134, df = 3308, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1264671 0.1928666
## sample estimates:
##       cor 
## 0.1598477
summary(lmer(r.empirical ~ log.pop +  (1|fam), all.corrs.pop))
## Linear mixed model fit by REML ['lmerMod']
## Formula: r.empirical ~ log.pop + (1 | fam)
##    Data: all.corrs.pop
## 
## REML criterion at convergence: -2611.9
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.6634 -0.6777  0.0040  0.6792  3.4021 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  fam      (Intercept) 0.007919 0.08899 
##  Residual             0.025241 0.15887 
## Number of obs: 3310, groups:  fam, 163
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 0.031135   0.014239   2.187
## log.pop     0.002982   0.001212   2.459
## 
## Correlation of Fixed Effects:
##         (Intr)
## log.pop -0.715

Cross-sample correlations

google.cb = read.csv('/Documents/GRADUATE_SCHOOL/Projects/langLearnVar/data/lewis_2015.csv') %>%
  rename(corr.g = corr,
         lci.g = lower.ci,
         hci.g = upper.ci)

aoa.cb = read.csv('/Documents/GRADUATE_SCHOOL/Projects/ref_complex/Experiment_43/analysis/RC43_CDI/data/aoa_corrs.csv') 

swadesh.cb = empirical.corrs %>%
            rename(corr.swa = r.empirical) 


all.cb = left_join(google.cb, swadesh.cb, by = "language") %>%
                  left_join(aoa.cb, by = "language") %>%
            select(corr.g, hci.g, lci.g, corr.aoa,
                   hci.aoa, lci.aoa, corr.swa, hci.swa, lci.swa, language)

ggplot(all.cb, aes(x = corr.g, y = corr.swa)) +
    geom_point() +
    geom_smooth(method = "lm") +
    #geom_errorbar(aes(ymin = lci.swa,ymax = hci.swa)) + 
    #geom_errorbarh(aes(xmin = lci.g,xmax = hci.g)) +
    #geom_label(aes(label = language), size = 3) +
    ylim(-.2, .6) +
    xlim(-.2, .6) +
    theme_bw(base_size = 15) +
    ggtitle("Cross-sample correlations of CB") +
    xlab("Google Translate CB Correlation") +
    ylab("Swadesh CB Correlation")

cor.test(all.cb$corr.g, all.cb$corr.swa)
## 
##  Pearson's product-moment correlation
## 
## data:  all.cb$corr.g and all.cb$corr.swa
## t = 2.8563, df = 50, p-value = 0.006228
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1132077 0.5873985
## sample estimates:
##       cor 
## 0.3745369