Syllabifying Words

Load required libraries.

library(boot)
library(dplyr)
library(directlabels)
library(RMySQL)
library(tidyr)
library(bootstrap)
library(ggplot2)
library(RCurl)
library(magrittr)
library(readr)
library(stringr)
library(PerformanceAnalytics)
library(cowplot)
library(langcog)

Get a script that provides interface functions for pulling data out of Wordbank.

url <- 'https://raw.githubusercontent.com/langcog/wordbank/master/shiny_apps/data_loading.R'
script <- getURL(url, ssl.verifypeer = FALSE)
eval(parse(text = script))

Connect to the Wordbank database.

wordbank <- connect.to.wordbank("prod")

Load tables

common.tables <- get.common.tables(wordbank)
instrument.tables <- get.instrument.tables(wordbank, common.tables)

admins <- get.administration.data(common.tables)
items <- get.item.data(common.tables)

Filter down to appropriate kids: administrations of the Words & Gestures (WG) form

vocab.admins <- admins %>%
  select(data_id, language, form, age, sex, production) %>%
  filter(form == "WG")

Function that gets one language's data from Wordbank

get.language.data <- function(lang, vocab.data) {
  
  lang.table <- filter(instrument.tables, language==lang, form=="WG")$table[[1]]
  
  words <- items %>%
    filter(type == "word", language == lang, form == "WG") %>%
    select(definition, item.id, uni_lemma, category, lexical_category) %>%
    rename(item_id = item.id)
  
  lang.data <- lang.table %>%
    filter(basetable_ptr_id %in% vocab.data$data_id) %>%
    select_(.dots=c("basetable_ptr_id", words$item_id)) %>%
    as.data.frame %>% 
    gather(item_id, value, -basetable_ptr_id) %>%
    rename(data_id = basetable_ptr_id) %>%
    mutate(value = ifelse(is.na(value), "", value)) %>%
    left_join(vocab.data)
  
  return(left_join(lang.data, words))

}

Get all data

languages <- c("English", "Spanish", "Swedish")

all.data <- bind_rows(sapply(languages, 
                             function(lang) get.language.data(lang,vocab.admins),
                             simplify = FALSE))

Compute acquisition order

order.data <- all.data %>%
  ungroup() %>%
  mutate(language = factor(language)) %>%
  group_by(language, uni_lemma,definition,item_id) %>%
  summarise(produces = mean(value == "produces"),
            understands = mean(value == "produces" | value == "understands"),
            only.understands = mean(value == "understands")) %>%
  summarise(produces = max(produces),
            understands = max(understands),
            only.understands = max(only.understands)) %>%
  gather(measure, prop, produces, understands, only.understands) %>%
  group_by(language, measure) %>%
  mutate(order = rank(-prop)) %>%
  arrange(order)
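
As a sanity check on the ordering, rank(-prop) gives order 1 to the item with the highest proportion within each language and measure; a tiny illustration with made-up values:

toy <- data.frame(word = c("ball", "dog", "spoon"),
                  prop = c(0.9, 0.7, 0.2))
rank(-toy$prop)  # 1 2 3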

Get frequency and phoneme data and merge.

# eng.phons <- read_delim('mrc.phons.txt',delim='\t')
# 
# eng.freqs <- read_csv('english.freqs.csv') %>%
#   gather(word, count) %>%
#   group_by(word) %>%
#   summarize(frequency = mean(count)) %>%
#   ungroup() %>%
#   mutate(word = str_trim(tolower(gsub('\\.', ' ', word)))) %>%
#   left_join(eng.phons)
# 
# eng.cats <- all.data %>%
#   filter(language=="English") %>%
#   select(uni_lemma,category,lexical_category) %>%
#   distinct()

Export the data for syllabification

# out.data <- order.data %>%
#   filter(language == "English", measure == "produces") %>%
#   select(-measure, language)
# out.data[out.data$definition == "daddy*","definition"] = "daddy"
# out.data[out.data$definition == "mommy*","definition"] = "mommy"
# out.data[out.data$definition == "grandma*","definition"] = "grandma"
# out.data[out.data$definition == "grandpa*","definition"] = "grandpa"
# 
# out.all <- left_join(left_join(out.data,eng.freqs,
#                                by=c("uni_lemma" = "word")),
#                      eng.cats, by=c("uni_lemma" = "uni_lemma"))
# 
# write.csv(out.all,"eng.out.all.csv")

Load the syllabified data for all languages

eng.data <- read_csv('data/eng.out.all.csv')

swe.data <- read_csv('data/Swedish_CDI_syls.csv') %>%
  select(-category, -lexical_category)

swe.cats <- all.data %>%
  filter(language=="Swedish") %>%
  select(definition,uni_lemma,category,lexical_category) %>%
  distinct()

swe.data <- left_join(swe.cats,swe.data) %>%
  mutate(language = "Swedish")


span.data <- read_csv('data/spanish.sylls.csv') %>%
  mutate(language = "Spanish")

span.cats <- all.data %>%
  filter(language=="Spanish") %>%
  select(definition,uni_lemma,category,lexical_category) %>%
  distinct()

span.data <- left_join(span.cats,span.data)

syll.data <- bind_rows(eng.data,span.data,swe.data) %>%
  select(language,definition,uni_lemma,category,lexical_category,
         order,prop,frequency,syllables,phones)
# 
# comp.data <- eng.data %>%
#   select(-prop,-order,-measure,-definition) %>%
#   left_join(filter(order.data,measure == "understands"))
# 
# comp.data <- bind_rows(span.data) %>%
#   select(-prop,-order,-uni_lemma) %>%
#   left_join(filter(order.data,measure=="understands")) %>%
#   bind_rows(comp.data) %>%
#   select(-measure)
# 
# syll.count.data <- syll.data %>%
#   mutate(num.syllables = str_count(syllables, " ") + 1) %>%
#   filter(!is.na(num.syllables), num.syllables > 1)
# 
# phon.count.data <- syll.data %>%
#   rowwise() %>%
#   mutate(num.phones = nchar(gsub(' ','', syllables))) %>%
#   group_by(language,num.phones) %>%
#   summarise_each(funs(mean,sem),prop)
# 
# syll.in.lang <- syll.count.data %>%
#   group_by(language,num.syllables) %>%
#   summarise(n = n())
#   
# quartz(width=6,height=4)
# ggplot(phon.count.data, 
#        aes(x = num.phones, y = mean, color = language, label=language,
#            fill = language)) +
#   geom_pointrange(aes(ymax = mean + sem, ymin = mean-sem))+
#   geom_line() +
#   facet_grid(~ language) + 
#   theme(legend.position = "none")

Analysis

# Remove stress markings
clean.data <- syll.data %>%
  mutate(syllables = str_trim(gsub('\'', '', gsub('\\,', '', syllables)))) %>%
  filter(!is.na(syllables)) %>%
  arrange(language,order)
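
# Quick check of the stress stripping on a made-up transcription (the
# apostrophe and comma below are hypothetical stress marks, not real data):
str_trim(gsub('\'', '', gsub('\\,', '', "'ba ,na na")))  # "ba na na"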

# Get all of the unique syllables in a language
get.dict <- function(lang,data=clean.data) {
  per.word <- sapply(filter(data,language == lang)$syllables,
                   function(x) {str_split(x," ")[[1]]},
                   USE.NAMES = FALSE,simplify=TRUE)
  dict <- unique(unlist(per.word))
  return(dict[!is.na(dict)])
}
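
# Toy illustration (made-up syllabifications, not Wordbank data): the
# dictionary is just the set of distinct syllables across the words.
toy.data <- data.frame(language = "English",
                       syllables = c("ba by", "ba na na"),
                       stringsAsFactors = FALSE)
get.dict("English", toy.data)  # "ba" "by" "na"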

# Make a word x syllable matrix. A 1 indicates that word i has syllable j 
get.syllable.counts <- function(lang,data=clean.data) {
  dict <- get.dict(lang,data)
  words <- filter(data,language == lang)
  
  counts <- as.data.frame(sapply(dict, function(syllable) {
    as.numeric(str_count(words$syllables, syllable) > 0)
  }, simplify = "matrix"))
  
  return(counts)
}
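
# On the same toy data, the matrix has one row per word and one column per
# syllable, with a 1 wherever that word contains that syllable:
get.syllable.counts("English", toy.data)
#   ba by na
# 1  1  1  0
# 2  1  0  1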

# Determine if a word has at least one unique syllable so far in acquisition
has.unique.syllable <- function(lang,data=clean.data) {
    
  words <- filter(data,language == lang)
  counts <- get.syllable.counts(lang,data)
  cumsum.counts <- cumsum(counts)
  
  cumsum.counts.offset <- rbind(rep(0,ncol(counts)),
    cumsum.counts[1:(nrow(cumsum.counts)-1),])
  
  unique.counts <- (cumsum.counts == 1) & (cumsum.counts.offset == 0)
  
  words$unique.syllable <- rowSums(unique.counts) > 0
  
  return(words)
}
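
# Worked example of the running-count trick for a single syllable column:
# a syllable is "new" at word i exactly when its cumulative count first
# reaches 1 there, i.e. it is 1 at row i and was 0 at row i-1.
col <- c(1, 0, 1, 0)
cumsum(col)                      # 1 1 2 2
c(0, cumsum(col)[-length(col)])  # 0 1 1 2
(cumsum(col) == 1) & (c(0, cumsum(col)[-length(col)]) == 0)  # TRUE FALSE FALSE FALSE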

# Unique syllables in each language
unique.syllables <- bind_rows(lapply(languages, has.unique.syllable)) %>%
  group_by(language) %>%
  mutate(prop.unique = cummean(unique.syllable))
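
# cummean turns the logical column into the running proportion of words
# (so far in acquisition order) that introduce at least one new syllable, e.g.:
cummean(c(TRUE, FALSE, TRUE, TRUE))  # 1.00 0.50 0.67 0.75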

# One baseline sample: randomly shuffle word order within each language
baseline.sample <- function() {
  data <- clean.data %>%
    group_by(language) %>%
    sample_frac()
  
  bind_rows(lapply(languages, function(x) {has.unique.syllable(x,data)})) %>%
    group_by(language) %>%
    mutate(prop.unique = cummean(unique.syllable)) %>%
    ungroup() %>%
    select(prop.unique) %>%
    data.frame()
    
}

# Take 100 samples, each randomly re-ordering words in acquisition
baseline.samples <- bind_cols(replicate(100, baseline.sample(), simplify = FALSE))
names(baseline.samples) <- as.character(seq(1,ncol(baseline.samples)))

# Compute 95% CIs over samples
baseline.samples.stats <- baseline.samples %>%
  mutate(row = 1:nrow(baseline.samples)) %>%
  gather(sample,prop,-row) %>%
  group_by(row) %>%
  summarise_each(funs(mean,ci.upper,ci.lower),prop) %>%
  mutate(language = unique.syllables$language) %>%
  group_by(language) %>%
  mutate(order = 1:n()) %>%
  select(-row) %>%
  rename(prop.unique = mean)
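
# If langcog is unavailable, a quantile band over the shuffled samples gives
# a comparable 95% interval (an illustrative sketch, not necessarily
# identical to langcog's ci.upper/ci.lower):
baseline.quantile.stats <- baseline.samples %>%
  mutate(row = 1:nrow(baseline.samples)) %>%
  gather(sample, prop, -row) %>%
  group_by(row) %>%
  summarise(ci.lower = quantile(prop, 0.025),
            ci.upper = quantile(prop, 0.975))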

# Average words in the same acquisition order
plotting.unique.syllables <- unique.syllables %>%
  group_by(language) %>%
  mutate(order = 1:n()) %>%
  group_by(language,order) %>%
  summarise(prop.unique = mean(prop.unique))

Plot

# quartz(width=6,height=4)
ggplot(plotting.unique.syllables, 
       aes(x = order, y = prop.unique, color = language, label=language,
           fill = language)) +
  geom_point(size = 1)+
  geom_dl(method = list("smart.grid", cex=1))+ 
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1") +
  theme(legend.position = "none") +
  geom_ribbon(aes(ymax = ci.upper,ymin = ci.lower),
              data = baseline.samples.stats,alpha = .25,linetype=0) +
  geom_line(data = baseline.samples.stats) +
  scale_x_continuous(name = "Acquisition order") +
  scale_y_continuous(name = "Prop. words with a unique syllable")