Syllabifying Words
Load required libraries.
library(boot)
library(dplyr)
library(directlabels)
library(RMySQL)
library(tidyr)
library(bootstrap)
library(ggplot2)
library(RCurl)
library(magrittr)
library(readr)
library(stringr)
library(PerformanceAnalytics)
library(cowplot)
library(langcog)
Get a script that provides interface functions for pulling data out of Wordbank.
url <- 'https://raw.githubusercontent.com/langcog/wordbank/master/shiny_apps/data_loading.R'
script <- getURL(url, ssl.verifypeer = FALSE)
eval(parse(text = script))
Connect to the Wordbank database.
wordbank <- connect.to.wordbank("prod")
Load tables
common.tables <- get.common.tables(wordbank)
instrument.tables <- get.instrument.tables(wordbank, common.tables)
admins <- get.administration.data(common.tables)
items <- get.item.data(common.tables)
Filter down to appropriate kids
vocab.admins <- admins %>%
select(data_id, language, form, age, sex, production) %>%
filter(form == "WG")
Function that get’s one language’s data from wordbank
get.language.data <- function(lang, vocab.data) {
lang.table <- filter(instrument.tables, language==lang, form=="WG")$table[[1]]
words <- items %>%
filter(type == "word", language == lang, form == "WG") %>%
select(definition, item.id, uni_lemma, category, lexical_category) %>%
rename(item_id = item.id)
lang.data <- lang.table %>%
filter(basetable_ptr_id %in% vocab.data$data_id) %>%
select_(.dots=c("basetable_ptr_id", words$item_id)) %>%
as.data.frame %>%
gather(item_id, value, -basetable_ptr_id) %>%
rename(data_id = basetable_ptr_id) %>%
mutate(value = ifelse(is.na(value), "", value)) %>%
left_join(vocab.data)
return(left_join(lang.data, words))
}
Get all data
languages <- c("English", "Spanish", "Swedish")
all.data <- bind_rows(sapply(languages,
function(lang) get.language.data(lang,vocab.admins),
simplify = FALSE))
Compute acquisition order
order.data <- all.data %>%
ungroup() %>%
mutate(language = factor(language)) %>%
group_by(language, uni_lemma,definition,item_id) %>%
summarise(produces = mean(value == "produces"),
understands = mean(value == "produces" | value == "understands"),
only.understands = mean(value == "understands")) %>%
summarise(produces = max(produces),
understands = max(understands),
only.understands = max(only.understands)) %>%
gather(measure, prop, produces, understands, only.understands) %>%
group_by(language, measure) %>%
mutate(order = rank(-prop)) %>%
arrange(order)
Get frequency and phoneme data and merge.
# eng.phons <- read_delim('mrc.phons.txt',delim='\t')
#
# eng.freqs <- read_csv('english.freqs.csv') %>%
# gather(word, count) %>%
# group_by(word) %>%
# summarize(frequency = mean(count)) %>%
# ungroup() %>%
# mutate(word = str_trim(tolower(gsub('\\.', ' ', word)))) %>%
# left_join(eng.phons)
#
# eng.cats <- all.data %>%
# filter(language=="English") %>%
# select(uni_lemma,category,lexical_category) %>%
# distinct()
For exporting data for Syllabification
# out.data <- order.data %>%
# filter(language == "English", measure == "produces") %>%
# select(-measure, language)
# out.data[out.data$definition == "daddy*","definition"] = "daddy"
# out.data[out.data$definition == "mommy*","definition"] = "mommy"
# out.data[out.data$definition == "grandma*","definition"] = "grandma"
# out.data[out.data$definition == "grandpa*","definition"] = "grandpa"
#
# out.all <- left_join(left_join(out.data,eng.freqs,
# by=c("uni_lemma" = "word")),
# eng.cats, by=c("uni_lemma" = "uni_lemma"))
#
# write.csv(out.all,"eng.out.all.csv")
Load all data
eng.data <- read_csv('data/eng.out.all.csv')
swe.data <- read_csv('data/Swedish_CDI_syls.csv') %>%
select(-category, -lexical_category)
swe.cats <- all.data %>%
filter(language=="Swedish") %>%
select(definition,uni_lemma,category,lexical_category) %>%
distinct()
swe.data <- left_join(swe.cats,swe.data) %>%
mutate(language = "Swedish")
span.data <- read_csv('data/spanish.sylls.csv') %>%
mutate(language = "Spanish")
span.cats <- all.data %>%
filter(language=="Spanish") %>%
select(definition,uni_lemma,category,lexical_category) %>%
distinct()
span.data <- left_join(span.cats,span.data)
syll.data <- bind_rows(eng.data,span.data,swe.data) %>%
select(language,definition,uni_lemma,category,lexical_category,
order,prop,frequency,syllables,phones)
#
# comp.data <- eng.data %>%
# select(-prop,-order,-measure,-definition) %>%
# left_join(filter(order.data,measure == "understands"))
#
# comp.data <- bind_rows(span.data) %>%
# select(-prop,-order,-uni_lemma) %>%
# left_join(filter(order.data,measure=="understands")) %>%
# bind_rows(comp.data) %>%
# select(-measure)
#
# syll.count.data <- syll.data %>%
# mutate(num.syllables = str_count(syllables, " ") + 1) %>%
# filter(!is.na(num.syllables), num.syllables > 1)
#
# phon.count.data <- syll.data %>%
# rowwise() %>%
# mutate(num.phones = nchar(gsub(' ','', syllables))) %>%
# group_by(language,num.phones) %>%
# summarise_each(funs(mean,sem),prop)
#
# syll.in.lang <- syll.count.data %>%
# group_by(language,num.syllables) %>%
# summarise(n = n())
#
# quartz(width=6,height=4)
# ggplot(phon.count.data,
# aes(x = num.phones, y = mean, color = language, label=language,
# fill = language)) +
# geom_pointrange(aes(ymax = mean + sem, ymin = mean-sem))+
# geom_line() +
# facet_grid(~ language) +
# theme(legend.position = "none")
# geom_dl(method = list("smart.grid", cex=1))+
# scale_color_brewer(palette = "Set1") +
# scale_fill_brewer(palette = "Set1") +
# theme(legend.position = "none") +
# geom_ribbon(aes(ymax = ci.upper,ymin = ci.lower),
# data = baseline.samples.stats,alpha = .25,linetype=0) +
# geom_line(data = baseline.samples.stats) +
# scale_x_continuous(name = "Acquisition order") +
# scale_y_continuous(name = "Prop. words with a unique syllable")
Analysis
# Remove stress markings
clean.data <- syll.data %>%
mutate(syllables = str_trim(gsub('\'', '', gsub('\\,', '', syllables)))) %>%
filter(!is.na(syllables)) %>%
arrange(language,order)
# Get all of the unique syllables in a language
get.dict <- function(lang,data=clean.data) {
per.word <- sapply(filter(data,language == lang)$syllables,
function(x) {str_split(x," ")[[1]]},
USE.NAMES = FALSE,simplify=TRUE)
dict <- unique(unlist(per.word))
return(dict[!is.na(dict)])
}
# Make a word x syllable matrix. A 1 indicates that word i has syllable j
get.syllable.counts <- function(lang,data=clean.data) {
dict <- get.dict(lang,data)
words <- filter(data,language == lang)
counts <- as.data.frame(sapply(dict,function(syllable) {
as.numeric(str_count(words$syllables,syllable) > 0)
},simplify = "matrix"))
}
# Determine if a word has at least one unique syllable so far in acqusition
has.unique.syllable <- function(lang,data=clean.data) {
words <- filter(data,language == lang)
counts <- get.syllable.counts(lang,data)
cumsum.counts <- cumsum(counts)
cumsum.counts.offset <- rbind(rep(0,ncol(counts)),
cumsum.counts[1:(nrow(cumsum.counts)-1),])
unique.counts = (cumsum.counts == 1) & (cumsum.counts.offset == 0)
words$unique.syllable <- rowSums(unique.counts) > 0
return(words)
}
# Unique syllables in each language
unique.syllables <- bind_rows(lapply(languages, has.unique.syllable)) %>%
group_by(language) %>%
mutate(prop.unique = cummean(unique.syllable))
#Bootstrap random samples
baseline.sample <- function() {
data <- clean.data %<>%
group_by(language) %>%
sample_frac()
bind_rows(lapply(languages, function(x) {has.unique.syllable(x,data)})) %>%
group_by(language) %>%
mutate(prop.unique = cummean(unique.syllable)) %>%
ungroup() %>%
select(prop.unique) %>%
data.frame()
}
# Take samples randomly re-ordering words in acqusition
baseline.samples <-bind_cols(replicate(100,baseline.sample(),simplify=FALSE))
names(baseline.samples) <- as.character(seq(1,ncol(baseline.samples)))
# Compute 95% CIs over samples
baseline.samples.stats <- baseline.samples %>%
mutate(row = 1:nrow(baseline.samples)) %>%
gather(sample,prop,-row) %>%
group_by(row) %>%
summarise_each(funs(mean,ci.upper,ci.lower),prop) %>%
mutate(language = unique.syllables$language) %>%
group_by(language) %>%
mutate(order = 1:n()) %>%
select(-row) %>%
rename(prop.unique = mean)
# Average words in the same acquisition order
plotting.unique.syllables <- unique.syllables %>%
group_by(language) %>%
mutate(order = 1:n()) %>%
group_by(language,order) %>%
summarise(prop.unique = mean(prop.unique))
Plot
# quartz(width=6,height=4)
ggplot(plotting.unique.syllables,
aes(x = order, y = prop.unique, color = language, label=language,
fill = language)) +
geom_point(size = 1)+
geom_dl(method = list("smart.grid", cex=1))+
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1") +
theme(legend.position = "none") +
geom_ribbon(aes(ymax = ci.upper,ymin = ci.lower),
data = baseline.samples.stats,alpha = .25,linetype=0) +
geom_line(data = baseline.samples.stats) +
scale_x_continuous(name = "Acquisition order") +
scale_y_continuous(name = "Prop. words with a unique syllable")