I’ve been thinking a lot about categorical outcomes lately. For example, we know that as children get more better at language (like me), they say more different kinds of words. They often start with short expressions or communicative words (“uh-oh”, “hi”), names, and maybe some important nouns like “milk” and “train” (trains are very important). As they get into the sentence game, they gradually add more sophisticated forms like verbs, descriptive words (adjectives, adverbs), and even function words like “like”.

So let’s say I’m interested in the proportion of words in each category, and how those proportions change over developmental time. Like if a mosaic plot and a growth curve had a beautiful love child. Welcome to conditional density plots!

# Load some cached data from a side project
# (sorry, I can't share it publicly yet).
# The .RData file is expected to provide `data_eng`, which is used below.
load("~/Documents/STUDIES/IbbotsonFreq/cache/data_eng.RData")

library(dplyr) # for the pipe: %>%
library(knitr) # for kable()

# Preview the data: the first 6 rows, formatted as a pretty table :)
kable(head(data_eng, n = 6))
id age word POS child chi.count adu.count N.chi.utts N.adu.utts
Adam_27 27 adj|afraid adj Adam 0 1 1284 634
Adam_27 27 adj|all_gone adj Adam 3 1 1268 448
Adam_27 27 adj|alright adj Adam 6 8 2552 1082
Adam_27 27 adj|back adj Adam 5 12 2552 1082
Adam_27 27 adj|big adj Adam 18 7 2552 1082
Adam_27 27 adj|brief adj Adam 1 3 2552 1082
# Un-summarise the child counts: instead of one row per word with its total
# in chi.count, produce one row for each instance in a child utterance.
# Only ages strictly below 40 months are kept (so 40 itself is excluded);
# the youngest age in the data is 24 months.
data_expanded <- splitstackshape::expandRows(
  dplyr::filter(data_eng, age < 40),
  count = "chi.count"
)

# A handful of interesting parts of speech (POS) to examine; keeping the
# set small makes the plots cleaner.
pos <- c("n", "v", "adj", "part", "n_prop", "adv", "co", "prep", "pro_sub")

# Recode POS as a factor whose levels are the chosen tags plus "other".
# "other" does not exist in the original data; it is added here as a
# catch-all level. Any POS value that is not one of these levels turns
# into NA at this step.
data_expanded$POS <- factor(
  data_expanded$POS,
  levels = append(pos, "other")
)

# Per-level counts, including how many values became NA
summary(data_expanded$POS)
##       n       v     adj    part  n_prop     adv      co    prep pro_sub 
## 3504090  101081   49578   19869  525349   85171  123822   74624   30527 
##   other    NA's 
##       0  840722
# Fold every NA in POS into the catch-all "other" level
data_expanded$POS <- replace(
  data_expanded$POS,
  is.na(data_expanded$POS),
  "other"
)

# Per-level counts again; the former NAs are now counted under "other"
summary(data_expanded$POS)
##       n       v     adj    part  n_prop     adv      co    prep pro_sub 
## 3504090  101081   49578   19869  525349   85171  123822   74624   30527 
##   other 
##  840722
library(ggplot2)

# Legend order and colours shared by the fill and outline scales of both
# plots: one rainbow colour per selected POS, plus grey for "other".
pos_levels <- c(pos, "other")
pos_palette <- c(rainbow(length(pos)), "#808080")

# Everything the two plots have in common: data, aesthetics, one panel per
# child (each with its own axis ranges), and the colour scales.
# y = after_stat(count) uses the count statistic computed by the density
# stat rather than the default density; it replaces the `..count..`
# notation, which is deprecated as of ggplot2 3.4.0.
density_base <- ggplot(
  data_expanded,
  aes(x = age, y = after_stat(count), fill = POS, color = POS)
) +
  facet_wrap(~child, scales = "free") +
  scale_fill_manual(values = pos_palette, limits = pos_levels) +
  scale_color_manual(values = pos_palette, limits = pos_levels)

# Stacked densities: the POS curves are piled on top of one another
density_base +
  geom_density(position = "stack", adjust = 5)

# Filled densities: at every age the stack is rescaled to the full panel
# height, so each band shows that POS's share of words
density_base +
  geom_density(position = "fill", adjust = 5) +
  labs(title = "Conditional density plot!")