Load required libraries
rm(list=ls())
library(ggplot2)
library(dplyr)
library(tidyr)
library(RMySQL)
library(stringr)
library(magrittr)
library(pcaPP)
library(directlabels)
man.cols <- c("#e41a1c","#377eb8","#4daf4a",
"#984ea3","#ff7f00","#a65628",
"#f781bf","#999999") #Brewer Set1 without yellow
Pull in wordbank dat
## OPEN DATABASE CONNECTION ##
wordbank <- src_mysql(dbname='wordbank',host="54.149.39.46",
user="wordbank",password="wordbank")
## NOW LOAD TABLES ##
source.table <- tbl(wordbank,"common_source")
admin.table <- tbl(wordbank,"common_administration")
child.table <- tbl(wordbank,"common_child")
wordmapping.table <- tbl(wordbank,"common_wordmapping")
instruments.table <- tbl(wordbank,"common_instrumentsmap")
english.ws.table <- tbl(wordbank,"instruments_english_ws")
spanish.ws.table <- tbl(wordbank,"instruments_spanish_ws")
danish.ws.table <- tbl(wordbank,"instruments_danish_ws")
# Get administration info
admins <- admin.table %>%
select(data_id,child_id,age,source_id) %>%
rename(id = data_id, child.id = child_id, source.id = source_id)
admins <- as.data.frame(admins)
# Get demographic variables for each child
demos <- select(child.table,id,sex,mom_ed,birth_order) %>%
rename(child.id = id) %>%# Rename id fields
as.data.frame %>%
group_by(child.id) %>%
rename(mom.ed = mom_ed) %>%
filter(!is.na(mom.ed),mom.ed != "") %>%
mutate(mom.ed = factor(mom.ed,
levels = c("nothing","primary","some secondary",
"secondary","some college",
"college","some graduate","graduate"),
labels = c("None", "Primary School",
"Some Secondary School",
"Secondary School",
"Some College", "College",
"Some Graduate School","Graduate School")))
# Join age and demographics together
child.data <- as.tbl(left_join(admins,demos))
# Set up mappings and instruments.
mapping <- as.data.frame(wordmapping.table)
instruments <- as.data.frame(instruments.table) %>%
rename(instrument_id = id)
items <- left_join(mapping, instruments)
Extract data for each kid
# Function for getting all of the data in wordbank
# for a given language (kid x item).
get.language.data <- function(lang.table, lang.items, lang, child.data) {
instrument.items <- lang.items %>%
filter(language == lang, form == 'WS') %>%
select(item, type, category, lexical_category, definition) %>%
mutate(item = str_replace(item, "\\.", "_")) # Fix _/. inconsistencies
instrument.data <- as.data.frame(lang.table) %>%
rename(id = basetable_ptr_id) %>% # Rename the id
gather(item, value, -id) %>% # Arrange in longform
mutate(item = str_replace(item, "item_", "")) # Strip off item_
d <- left_join(instrument.data, instrument.items)
d <- left_join(d, child.data)
}
#Get (kid x item) data for all languages.
d.english <- get.language.data(lang.table=english.ws.table,
lang.items=items,
lang="English",
child.data)
d.spanish <- get.language.data(lang.table=spanish.ws.table,
lang.items=items,
lang="Spanish",
child.data)
d.danish <- get.language.data(lang.table=danish.ws.table,
lang.items=items,
lang="Danish",
child.data)
# Danish data is loaded in funny -- NAs in wordform are actually 0s
d.danish[d.danish$type %in% c("word_form","word")
& is.na(d.danish$value),]$value = ""
#Function for getting vocab size data.
language.vocab.sizes <- function(lang.data,lang) {
d.vocab <- lang.data %>%
filter(type == "word") %>%
group_by(age,mom.ed,id) %>%
summarise(vocab.sum = sum(value == "produces", na.rm=TRUE),
vocab.mean = vocab.sum/length(value)) %>%
mutate(language = lang)
return(d.vocab)
}
eng.production.data <- language.vocab.sizes(d.english,"English")
span.production.data <- language.vocab.sizes(d.spanish,"Spanish")
dan.production.data <- language.vocab.sizes(d.danish,"Danish")
Put together all data, plot
all.production.data <- bind_rows(eng.production.data,span.production.data)
all.production.data <- bind_rows(all.production.data,dan.production.data) %>%
filter(!is.na(mom.ed),mom.ed != "None",mom.ed !="Primary School")
cut.by <- .25
cuts <- seq(0.0,1.0, by=cut.by)
ddd <- all.production.data %>%
filter(age > 15, age < 31, mom.ed != "NA") %>%
group_by(language,age,mom.ed) %>%
mutate(p = rank(vocab.mean)/length(vocab.mean),
Quantile = cut(p, breaks=cuts,
labels= c("First Quartile", "Second Quartile",
"Third Quartile", "Fourth Quartile")))
#cuts[2:length(cuts)]-cut.by/2))
#quartz(height=6,width=10)
ggplot(ddd, aes(x=age, y=vocab.mean,colour=mom.ed))+
geom_jitter(size = .8,position = position_jitter(.1)) +
geom_smooth(se=FALSE,span = 1,method="loess")+
facet_grid(language ~ Quantile)+
scale_x_continuous(breaks=seq(16,30,2),
limits=c(16,30.5),
name = "Age (months)")+
scale_y_continuous(name = "Prop. Words Known") +
scale_color_manual(values=man.cols,
guide = guide_legend(reverse = TRUE,
title = "Mother's Education"))+
theme_bw(base_size=13)