Load required libraries

rm(list=ls())

library(ggplot2)
library(dplyr)
library(tidyr)
library(RMySQL)
library(stringr)
library(magrittr)
library(pcaPP)
library(directlabels)

man.cols <- c("#e41a1c","#377eb8","#4daf4a",
              "#984ea3","#ff7f00","#a65628",
              "#f781bf","#999999") #Brewer Set1 without yellow

Pull in wordbank dat

## OPEN DATABASE CONNECTION ##
wordbank <- src_mysql(dbname='wordbank',host="54.149.39.46",
                      user="wordbank",password="wordbank")

## NOW LOAD TABLES ##
source.table <- tbl(wordbank,"common_source")
admin.table <- tbl(wordbank,"common_administration")
child.table <- tbl(wordbank,"common_child")
wordmapping.table <- tbl(wordbank,"common_wordmapping")
instruments.table <- tbl(wordbank,"common_instrumentsmap")
english.ws.table <- tbl(wordbank,"instruments_english_ws")
spanish.ws.table <- tbl(wordbank,"instruments_spanish_ws")
danish.ws.table <- tbl(wordbank,"instruments_danish_ws")

# Get administration info
admins <- admin.table %>%
  select(data_id,child_id,age,source_id) %>%
  rename(id = data_id, child.id = child_id, source.id = source_id) 
admins <- as.data.frame(admins)

# Get demographic variables for each child
demos <- select(child.table,id,sex,mom_ed,birth_order) %>%
  rename(child.id = id) %>%# Rename id fields 
  as.data.frame %>%
  group_by(child.id) %>%
  rename(mom.ed = mom_ed) %>%
  filter(!is.na(mom.ed),mom.ed != "") %>%
  mutate(mom.ed = factor(mom.ed,
                         levels = c("nothing","primary","some secondary",
                                    "secondary","some college",
                                    "college","some graduate","graduate"),
                         labels = c("None", "Primary School", 
                                    "Some Secondary School", 
                                    "Secondary School",
                                    "Some College", "College", 
                                    "Some Graduate School","Graduate School")))


# Join age and demographics together
child.data <- as.tbl(left_join(admins,demos))

# Set up mappings and instruments.
mapping <- as.data.frame(wordmapping.table)
instruments <- as.data.frame(instruments.table) %>%
  rename(instrument_id = id)
items <- left_join(mapping, instruments)

Extract data for each kid

# Function for getting all of the data in wordbank 
# for a given language (kid x item).
get.language.data <- function(lang.table, lang.items, lang, child.data) {
  
  instrument.items <- lang.items %>% 
    filter(language == lang, form == 'WS') %>%
    select(item, type, category, lexical_category, definition) %>%
    mutate(item = str_replace(item, "\\.", "_")) # Fix _/. inconsistencies
  
  instrument.data <- as.data.frame(lang.table) %>%
    rename(id = basetable_ptr_id) %>% # Rename the id
    gather(item, value, -id) %>% # Arrange in longform
    mutate(item = str_replace(item, "item_", "")) # Strip off item_ 
  
  d <- left_join(instrument.data, instrument.items)
  d <- left_join(d, child.data)
}

#Get (kid x item) data for all languages.
d.english <- get.language.data(lang.table=english.ws.table, 
                               lang.items=items, 
                               lang="English",
                               child.data)

d.spanish <- get.language.data(lang.table=spanish.ws.table, 
                               lang.items=items, 
                               lang="Spanish",
                               child.data)

d.danish <- get.language.data(lang.table=danish.ws.table, 
                              lang.items=items, 
                              lang="Danish",
                              child.data)

# Danish data is loaded in funny -- NAs in wordform are actually 0s
d.danish[d.danish$type %in% c("word_form","word")
         & is.na(d.danish$value),]$value = ""

#Function for getting vocab size data.
language.vocab.sizes <- function(lang.data,lang) {
  d.vocab <- lang.data %>%
    filter(type == "word") %>%
    group_by(age,mom.ed,id) %>%
    summarise(vocab.sum = sum(value == "produces", na.rm=TRUE),
              vocab.mean = vocab.sum/length(value)) %>%
    mutate(language = lang)
  
  return(d.vocab)
}

eng.production.data <- language.vocab.sizes(d.english,"English")
span.production.data <- language.vocab.sizes(d.spanish,"Spanish")
dan.production.data <- language.vocab.sizes(d.danish,"Danish")

Put together all data, plot

all.production.data <- bind_rows(eng.production.data,span.production.data)
all.production.data <- bind_rows(all.production.data,dan.production.data) %>%
  filter(!is.na(mom.ed),mom.ed != "None",mom.ed !="Primary School")

cut.by <- .25
cuts <- seq(0.0,1.0, by=cut.by)
ddd <- all.production.data %>% 
  filter(age > 15, age < 31, mom.ed != "NA") %>%
  group_by(language,age,mom.ed) %>%
  mutate(p = rank(vocab.mean)/length(vocab.mean),
         Quantile = cut(p, breaks=cuts, 
                 labels= c("First Quartile", "Second Quartile",
                           "Third Quartile", "Fourth Quartile")))
                   #cuts[2:length(cuts)]-cut.by/2))

#quartz(height=6,width=10)
ggplot(ddd, aes(x=age, y=vocab.mean,colour=mom.ed))+
  geom_jitter(size = .8,position = position_jitter(.1)) +
  geom_smooth(se=FALSE,span = 1,method="loess")+
  facet_grid(language ~ Quantile)+
  scale_x_continuous(breaks=seq(16,30,2),
                     limits=c(16,30.5),
                     name = "Age (months)")+
  scale_y_continuous(name = "Prop. Words Known") +
  scale_color_manual(values=man.cols,
                     guide = guide_legend(reverse = TRUE,
                                          title = "Mother's Education"))+
  theme_bw(base_size=13)