This script shows the basics of how to connect to the Wordbank database, load data from it, and work with that data. We illustrate this with two sample analyses: the first works with aggregate vocabulary size data and breaks down vocabulary development by the child’s sex, and the second works with individual item data and looks at the trajectories of color words over age.
Load required libraries.
library(dplyr)
library(directlabels)
library(RMySQL)
library(tidyr)
library(bootstrap)
library(ggplot2)
library(RCurl)
Get a script that provides interface functions for pulling data out of Wordbank.
url <- 'https://raw.githubusercontent.com/langcog/wordbank/crossling/shiny_apps/data_loading.R'
script <- getURL(url, ssl.verifypeer = FALSE)
eval(parse(text = script))
Connect to the Wordbank database.
wordbank <- src_mysql(dbname='wordbank', host="54.149.39.46", #"54.200.225.86"
user="wordbank", password="wordbank")
Get all the common tables.
common.tables <- get.common.tables(wordbank)
Get all the instrument tables.
instrument.tables <- get.instrument.tables(wordbank, common.tables$instrumentsmap)
Load in administration data.
admins <- get.administration.data(common.tables$momed, common.tables$child,
common.tables$instrumentsmap, common.tables$administration) %>%
filter(!is.na(sex))
Number of administrations by sex for each instrument:
| language | form | female | male |
|---|---|---|---|
| Danish | WS | 1900 | 1814 |
| English | WG | 1149 | 1226 |
| English | WS | 2027 | 2151 |
| Norwegian | WG | 1503 | 1522 |
| Norwegian | WS | 6519 | 6450 |
| Spanish | WG | 373 | 405 |
| Spanish | WS | 541 | 553 |
Load in item data.
items <- get.item.data(common.tables$wordmapping,
common.tables$instrumentsmap)
Number of items by category for each instrument:
| language | form | word | word_form | complexity |
|---|---|---|---|---|
| Danish | WS | 725 | 29 | 33 |
| English | WG | 396 | 0 | 0 |
| English | WS | 680 | 25 | 37 |
| Norwegian | WG | 395 | 0 | 0 |
| Norwegian | WS | 731 | 33 | 42 |
| Spanish | WG | 428 | 0 | 0 |
| Spanish | WS | 680 | 24 | 37 |
Get by-child productive vocabulary data.
num.words <- items %>%
filter(form == "WS", type == "word") %>%
group_by(language) %>%
summarise(n = n())
vocab.data <- admins %>%
select(admin.id, language, form, age, sex, production) %>%
filter(form == "WS", age >= 16, age <= 30) %>%
group_by(language, sex, age) %>%
left_join(num.words) %>%
mutate(production = as.numeric(production)/n) %>%
summarise_each(funs(na.median, ci.median.low, ci.median.high), production)
What these data look like for a given age:
| language | sex | age | median | confidence interval (lower) | confidence interval (upper) |
|---|---|---|---|---|---|
| Danish | F | 24 | 0.402 | 0.070 | 0.038 |
| Danish | M | 24 | 0.311 | 0.048 | 0.058 |
| English | F | 24 | 0.575 | 0.049 | 0.043 |
| English | M | 24 | 0.406 | 0.037 | 0.069 |
| Norwegian | F | 24 | 0.456 | 0.036 | 0.029 |
| Norwegian | M | 24 | 0.377 | 0.031 | 0.043 |
| Spanish | F | 24 | 0.304 | 0.057 | 0.165 |
| Spanish | M | 24 | 0.318 | 0.106 | 0.126 |
Plot median productive vocabulary as a function of age.
ggplot(vocab.data,
aes(x=age, y=na.median,colour=sex,label=sex))+
facet_wrap(~ language) +
geom_pointrange(aes(ymin = na.median-ci.median.low,
ymax = na.median+ci.median.high),
size = .8,
show_guide = FALSE) +
geom_line(size=1) +
scale_x_continuous(breaks=seq(16,30,2),limits=c(16,31),
name = "Age (months)")+
scale_y_continuous(name = "Median Productive Vocabulary (proportion of total words)",
limits=c(0,1)) +
theme_bw(base_size=14) +
theme(legend.position="none", panel.grid=element_blank()) +
geom_dl(method = list(dl.trans(x=x +.2),"last.qp",cex=1)) +
scale_color_manual(values = c("#2c7bb6", "#d7191c"))
Set up color terms of each language.
colors.danish <- c("r\xf8d", "gul", "gr\xf8n", "bl\xe5", "sort", "hvid") # "lille" "brun"
colors.english <- c("red", "yellow", "green", "blue", "black", "white") # "orange" "brown"
colors.norwegian <- c ("r\xf8d", "gul", "gr\xf8nn" ,"bl\xe5", "svart", "hvit") # "brun" "oransje"
colors.spanish <- c("rojo", "amarillo", "verde", "azul", "negro", "blanco") # "rosa" "morado"
colors <- list("Danish" = colors.danish,
"English" = colors.english,
"Norwegian" = colors.norwegian,
"Spanish" = colors.spanish)
Function for getting color data for a given language.
get.language.data <- function(lang) {
lang.table <- filter(instrument.tables, language==lang, form=="WS")$table[[1]]
lang.colors <- colors[[lang]]
lang.items <- select(filter(items, language==lang, form=="WS"),
definition, item.id)
lang.color.items <- data.frame(color = colors.english, definition = lang.colors) %>%
left_join(lang.items)
lang.data <- get.instrument.data(lang.table, lang.color.items$item.id) %>%
mutate(language = lang,
item.id = paste("item_", item.id, sep="")) %>%
left_join(lang.color.items)
return(lang.data)
}
Load color data for each language and put it all together.
languages <- unique(instrument.tables$language)
color.data <- bind_rows(sapply(languages, get.language.data, simplify=FALSE))
color.byage.data <- color.data %>%
left_join(filter(admins, form == "WS", age >= 16, age <= 30)) %>%
group_by(language, color, age) %>%
mutate(produces = value == "produces") %>%
summarise_each(funs(na.mean, ci.mean.high, ci.mean.low), produces)
What these data look like for a given age and language:
| language | color | age | median | confidence interval (lower) | confidence interval (upper) |
|---|---|---|---|---|---|
| English | black | 24 | 0.370 | 0.047 | 0.040 |
| English | blue | 24 | 0.630 | 0.045 | 0.045 |
| English | green | 24 | 0.516 | 0.047 | 0.047 |
| English | red | 24 | 0.558 | 0.047 | 0.047 |
| English | white | 24 | 0.308 | 0.045 | 0.042 |
| English | yellow | 24 | 0.541 | 0.042 | 0.047 |
Plot the proportion of kids that produce each color as a function of their age.
ggplot(color.byage.data, aes(x=age, y=na.mean, color=color, label=color)) +
facet_wrap(~ language) +
geom_pointrange(aes(ymin = na.mean-ci.mean.low,
ymax = na.mean+ci.mean.high),
size = .8,
show_guide = FALSE) +
geom_line(size=1) +
theme_bw(base_size=14) +
theme(legend.position="none", panel.grid=element_blank()) +
geom_dl(method = list(dl.trans(x=x +.2), "last.qp", cex=.8)) +
scale_color_manual(values=c("black", "#2c7bb6", "#1a9641", "#d7191c", "gray87", "gold1")) +
scale_x_continuous(breaks=seq(16,30,2), limits=c(16,31.5),
name = "Age (months)")+
scale_y_continuous(name = "Proportion of Children Producing",
limits=c(0,1))