Complexity norms

Load data
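
These chunks assume a setup chunk (not shown) along the following lines; `multi_boot_standard`, `ci_lower`, and `ci_upper` presumably come from the langcog package:

# assumed setup: packages used throughout this document
library(jsonlite)  # fromJSON
library(dplyr)     # pipes, filter/summarise/joins
library(tidyr)     # gather, spread, separate
library(purrr)     # map
library(ggplot2)   # plots
library(broom)     # tidy
library(lme4)      # lmer
library(langcog)   # multi_boot_standard, ci_lower, ci_upper (assumed source)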

files = dir("../../production-results/")
d = data.frame()
for (i in seq_along(files)) {
    s <- as.data.frame(fromJSON(paste0("../../production-results/", files[i])))
    d = rbind(d, s)
}

# clean up names: strip the "answers." prefix
names(d) = gsub("^answers\\.", "", names(d))

Munge.

d_all = d %>%
  gather(variable, value, contains("_")) %>%
  separate(variable, into = c("variable", "trial_num"), sep = "_") %>%
  spread(variable, value) %>%
  mutate(value = as.numeric(value))

d_all$wordLength = nchar(d_all$word)

# split datasets by study
d_sw = filter(d_all, condition == "swadesh")

There are 197 participants in this study.
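
(That count presumably comes from something like the sketch below; the participant-ID column name is a guess, since it does not appear in the munged data shown above.)

# number of unique participants (sketch; "workerid" is an assumed column name)
d_all %>% distinct(workerid) %>% nrow()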

Anchor word complexity ratings

d_sw %>%
  filter(word %in% c("motherboard", "ball")) %>%
  group_by(word) %>%
  multi_boot_standard(column = "value", na.rm = T) %>%
  ggplot(aes(y = mean, x = word)) +
  geom_pointrange(aes(ymax = ci_upper, ymin = ci_lower)) +
  ylab("mean complexity")+
  ylim(1,7) +
  theme_bw(base_size = 18)

d_sw = d_sw %>%
        filter(!(word %in% c("motherboard", "ball")))

Response distribution

d_sw %>%
    ggplot(aes(x=value)) +
    geom_density(fill = "red") +
    geom_vline(aes(xintercept = mean(value))) +
    xlim(1,7) +
    theme_bw()

Complexity bias in English

sw.means = d_sw %>%
  mutate(word = trimws(word)) %>%
  group_by(word) %>%
  multi_boot_standard(column = "value")  %>%
  mutate(wordLength.eng = nchar(word))

ggplot(sw.means, aes(y = wordLength.eng, x = mean )) +
  geom_label(aes(label = word),position = "jitter") +
  geom_smooth(method = "lm") +
  xlim(1,7) +
  ylab("word length (char)") +
  xlab("Mean complexity rating") +
  theme_bw(base_size = 18)

ggplot(sw.means, aes(x = wordLength.eng, y = mean )) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper), position = "jitter") +
  ylim(1,7) +
  xlab("word length (char)") +
  ylab("Mean complexity rating") +
  theme_bw(base_size = 18) +
  coord_flip() 

Stats.

Word length is positively correlated with mean complexity rating in English (r = .32, p = .046).

tidy(cor.test(sw.means$wordLength.eng, sw.means$mean))
##    estimate statistic    p.value parameter  conf.low conf.high
## 1 0.3168701  2.059443 0.04635121        38 0.0059478 0.5719258
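
Given the borderline p-value, a rank-based correlation is a cheap robustness check (a sketch, not part of the original analysis):

# Spearman rank correlation as a robustness check
cor.test(sw.means$wordLength.eng, sw.means$mean, method = "spearman")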

Translations

Merge with all translations

# note: this is the ASJP coding system.
# [308 small languages have no ISO, leaving 4421 (TOTAL: 4729)]

# load in swadesh translations, and gather
swadesh.raw = read.csv("wichmann_2013.csv") %>%
  select(1:109) %>%
  gather(word, translation, I:name)

# get translation length: mean character length across comma-separated variant forms
swadesh.raw = swadesh.raw %>%
  mutate(nchar = unlist(lapply(
    strsplit(gsub("[[:space:]]", "", translation), ","),
    function(forms) mean(nchar(forms))))) %>%
  filter(translation != "")
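
For a cell holding several comma-separated variant forms, this takes the mean length across forms; e.g., for a hypothetical cell "kapa, kap":

# hypothetical cell with two variant forms: mean(nchar(c("kapa", "kap"))) = 3.5
mean(nchar(strsplit(gsub("[[:space:]]", "", "kapa, kap"), ",")[[1]]))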

# join with complexity norms
swadesh.crit = swadesh.raw %>%
  filter(word %in% sw.means$word) %>%
  left_join(sw.means) %>%
  rename(complexity = mean) %>%
  filter(ISO != "") %>%
  distinct(ISO, word, .keep_all = TRUE) # keep nchar, complexity, and language metadata

# add number of items in each language (max == 40)
swadesh.means = swadesh.crit %>%
  group_by(ISO) %>%
  mutate(n = n()) %>%
  ungroup()

total.languages = swadesh.means %>%
  distinct(ISO) %>%
  summarise(count = n())

complete.languages = swadesh.means %>%
  filter(n == 40) %>%
  distinct(ISO) %>%
  summarise(count = n())

There are 4421 languages in this dataset. 980 of those languages have translations for all 40 words.

Cross-linguistic complexity bias

Determine a cutoff for the number of items per language

MIN_ITEMS = 30

subset.languages = swadesh.means %>%
  filter(n >= MIN_ITEMS) %>%
  distinct(ISO) %>%
  summarise(count = n())

The cutoff is 30 items. This includes 3877 languages in the analysis.
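
One way to sanity-check this cutoff is to look at the distribution of items per language (a sketch, not in the original analysis):

# distribution of translated items per language, with the cutoff marked
swadesh.means %>%
  distinct(ISO, n) %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 1) +
  geom_vline(xintercept = MIN_ITEMS, linetype = "dashed") +
  theme_bw()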

Get complexity bias in each language.

empirical.corrs = swadesh.means %>%
  filter(n >= MIN_ITEMS) %>%
  group_by(ISO) %>%
  summarise(r.empirical = tidy(cor.test(nchar, complexity))$estimate,
            p = tidy(cor.test(nchar, complexity))$p.value,
            sig = ifelse(p < .05, "*", ""),
            language = tolower(names[1]),
            fam = tolower(wls_fam[1]),
            lat = lat[1],
            lon = lon[1],
            pop = pop[1])

Get “max” and “min” complexity bias in each language.

# importantly, these are the limits given the spread of lengths for these words.
# In principle, the bias could be larger/smaller if the variance were different.

get_corr_limits <- function(lang_data) {
  if (nrow(lang_data) > 10) { 
    sorted.nchar = sort(lang_data$nchar)
    sorted.complexity = sort(lang_data$complexity)
    r.max = tidy(cor.test(sorted.nchar, sorted.complexity))$estimate
    r.min = tidy(cor.test(sorted.nchar, rev(sorted.complexity)))$estimate
  } else {
    r.max = NA
    r.min = NA
  }

  data.frame(r.max =  r.max,
             r.min =  r.min,
             row.names = NULL)
}
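
A quick sanity check on made-up data (hypothetical values): rank-aligned vectors should give an r.max near 1 and an r.min near -1.

# hypothetical data: lengths and complexities already rank-aligned
get_corr_limits(data.frame(nchar = 1:12, complexity = (1:12)^2))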

limits.corrs <- swadesh.means %>%
  filter(n >= MIN_ITEMS) %>%
  split(.$ISO) %>%
  map(get_corr_limits) %>%
  bind_rows(.id = "ISO") %>%
  filter(ISO != "")

Bootstrap baseline for each language.

# for each language: shuffle the word-complexity pairing and compute r, nboot times;
# take the mean, and look at the distribution over mean rs

NBOOTS <- 100

one_sample <- function(lang_data) {
  function(k) {
    if (nrow(lang_data) > 10) { # same minimum-items guard as in get_corr_limits
      shuffled.nchar = sample(lang_data$nchar) # permute lengths against complexities
      tidy(cor.test(shuffled.nchar, lang_data$complexity))$estimate
    } else {
      NA
    }
  }
}

all_samples <- function(lang_data, nboot) {
  sample_values <- 1:nboot %>%
    map(one_sample(lang_data)) %>%
    unlist()
  data.frame(mean = mean(sample_values, na.rm = T),
             ci_lower = ci_lower(sample_values,  na.rm = T),
             ci_upper = ci_upper(sample_values,  na.rm = T),
             row.names = NULL)
}

baseline.mean.corrs <- swadesh.means %>%
  filter(n >= MIN_ITEMS) %>%
  split(.$ISO) %>%
  map(function(lang_data) all_samples(lang_data, NBOOTS)) %>%
  bind_rows(.id = "ISO") %>%
  filter(!is.nan(mean)) %>%
  rename(r.baseline = mean)

by language

all.corrs = left_join(empirical.corrs, baseline.mean.corrs) %>%
            left_join(limits.corrs)

all.corrs %>%
  gather("sample", "r", c(r.empirical, r.baseline, r.max, r.min)) %>%
  ggplot(aes(r, group = sample, fill = sample)) +
  geom_density(alpha = .4) +
  geom_vline(aes(xintercept = 0)) +
  geom_vline(aes(xintercept = -1)) +
  geom_vline(aes(xintercept = 1)) +
  theme_bw()

# paired t-test
t.test(all.corrs$r.baseline, all.corrs$r.empirical, paired = T)
## 
##  Paired t-test
## 
## data:  all.corrs$r.baseline and all.corrs$r.empirical
## t = -34.939, df = 3876, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.09843406 -0.08797393
## sample estimates:
## mean of the differences 
##               -0.093204

The empirical correlations are reliably higher than the shuffled baseline (by about .09 on average). Should we also do a non-parametric test here (e.g., a Wilcoxon signed-rank test)?
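
A minimal sketch of that check, using base R's paired Wilcoxon signed-rank test on the same columns:

# non-parametric analogue of the paired t-test above
wilcox.test(all.corrs$r.baseline, all.corrs$r.empirical, paired = TRUE)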

by family

all.corrs.fam = all.corrs %>%
  group_by(fam) %>%
        summarise(r.empirical = mean(r.empirical),
                  r.baseline = mean(r.baseline),
                  r.max = mean(r.max),
                  r.min = mean(r.min),
                  pop = pop[1])

all.corrs.fam %>%
  gather("sample", "r", c(r.empirical, r.baseline, r.max, r.min)) %>%
  ggplot(aes(r, group = sample, fill = sample)) +
  geom_density(alpha = .4) +
  geom_vline(aes(xintercept = 0)) +
  geom_vline(aes(xintercept = -1)) +
  geom_vline(aes(xintercept = 1)) +
  theme_bw()

# paired t-test
t.test(all.corrs.fam$r.baseline,all.corrs.fam$r.empirical, paired = T)
## 
##  Paired t-test
## 
## data:  all.corrs.fam$r.baseline and all.corrs.fam$r.empirical
## t = -5.9015, df = 219, p-value = 1.357e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.08438880 -0.04213518
## sample estimates:
## mean of the differences 
##             -0.06326199
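
The same non-parametric check could be run at the family level (a sketch):

wilcox.test(all.corrs.fam$r.baseline, all.corrs.fam$r.empirical, paired = TRUE)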

Population correlations

If anything, it looks like the bias gets bigger with larger populations. This is a different relationship than we found in the CogSci 2016 paper. (Note, though, that in the by-language mixed-effects model below, the population effect is no longer reliable once language family is included as a random effect.)

by language

all.corrs.pop = all.corrs %>%
  filter(pop > 0) %>%
  mutate(log.pop = log(pop)) %>%
  filter(log.pop >5)
  
ggplot(all.corrs.pop, aes(x = log.pop, y = r.empirical)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw()

cor.test(all.corrs.pop$log.pop, all.corrs.pop$r.empirical)
## 
##  Pearson's product-moment correlation
## 
## data:  all.corrs.pop$log.pop and all.corrs.pop$r.empirical
## t = 5.625, df = 3308, p-value = 2.009e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.06347734 0.13097107
## sample estimates:
##        cor 
## 0.09733612
summary(lmer(r.empirical ~ log.pop +  (1|fam), all.corrs.pop))
## Linear mixed model fit by REML ['lmerMod']
## Formula: r.empirical ~ log.pop + (1 | fam)
##    Data: all.corrs.pop
## 
## REML criterion at convergence: -2872.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.4753 -0.6610  0.0300  0.6841  3.1410 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  fam      (Intercept) 0.01005  0.1002  
##  Residual             0.02314  0.1521  
## Number of obs: 3310, groups:  fam, 163
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)  0.0818360  0.0143698   5.695
## log.pop     -0.0009608  0.0011672  -0.823
## 
## Correlation of Fixed Effects:
##         (Intr)
## log.pop -0.677

by family

all.corrs.pop = all.corrs.fam %>% # note: reuses the by-language object name, now at the family level
  filter(pop > 0) %>%
  mutate(log.pop = log(pop)) %>%
  filter(log.pop >5)
  
ggplot(all.corrs.pop, aes(x = log.pop, y = r.empirical)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw()

cor.test(all.corrs.pop$log.pop, all.corrs.pop$r.empirical)
## 
##  Pearson's product-moment correlation
## 
## data:  all.corrs.pop$log.pop and all.corrs.pop$r.empirical
## t = 2.3647, df = 146, p-value = 0.01936
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.03170071 0.34278385
## sample estimates:
##       cor 
## 0.1920622
