Analysis of CDM vocab survey data

Preliminaries:

library(plyr)
library(ggplot2)
# Make the black-and-white theme the default for every ggplot drawn below.
theme_set(theme_bw())
# Load the survey responses; all of the analyses below read from `df`.
df <- read.csv("data/CDMlangsurvey_analysis.csv")

Word Frequencies

# Tabulate how many times each distinct value of the Word column occurs.
x <- as.matrix(df$Word)
counts <- table(x)

# Base-graphics text settings: 8 pt font with no extra scaling.
par(ps = 8, cex = 1, cex.main = 1)

# Draw only the words that occur more than 5 times (the filter is strictly
# > 5, so a word with exactly 5 occurrences is left out). las = 2 turns the
# axis labels perpendicular to the axis.
barplot(
  height = counts[counts > 5],
  las = 2,
  main = "Frequency of Words",
  xlab = "Words"
)

plot of chunk unnamed-chunk-2

This is a totally reasonable way of doing the computation. Here’s an alternative in another idiom, using ddply with summarise and qplot. (Note that I also sorted so that the levels aren’t in alphabetical order - the syntax for this is annoying but it’s worth it).

## An alternate way of doing this, using plyr + ggplot2.
# One row per word, with `count` = number of rows of df for that word.
freqs <- ddply(df, .(Word), summarise, count = length(Word))

# Re-level Word so the bars run from most to least frequent instead of
# alphabetically; ties in count are broken by Word, also in decreasing order.
freqs$Word <- factor(
  freqs$Word,
  levels = with(freqs, Word[order(count, Word, decreasing = TRUE)])
)

# Plot only words with more than 5 occurrences. The `stat` argument of
# qplot() has been deprecated since ggplot2 2.0.0, so the bar layer is built
# explicitly; stat = "identity" uses the precomputed counts as bar heights.
ggplot(subset(freqs, count > 5), aes(x = Word, y = count)) +
  geom_bar(stat = "identity")

plot of chunk unnamed-chunk-3

Now, here’s why you’d do it that way; the same code can be broken down by other factors. Though note that sorting the levels got a bunch more annoying here, and I had to add an extra “total count” variable.

# Count responses per (Word, gender) pair, then attach each word's overall
# total so that words can be ordered and filtered by it.
freqs <- ddply(df, .(Word, gender), summarise, count = length(Word))
freqs <- ddply(freqs, .(Word), mutate, total.count = sum(count))

# Re-level Word by decreasing total count (ties broken by Word). Each word now
# appears once per gender, so unique() is needed to keep the levels distinct.
freqs$Word <- factor(
  freqs$Word,
  levels = unique(with(
    freqs,
    Word[order(total.count, Word, decreasing = TRUE)]
  ))
)

# Side-by-side bars per gender for words with more than 5 occurrences overall,
# dropping rows whose gender is the empty string. The `stat` and `position`
# arguments of qplot() have been deprecated since ggplot2 2.0.0, so the layer
# is built explicitly with ggplot() + geom_bar().
ggplot(
  subset(freqs, total.count > 5 & gender != ""),
  aes(x = Word, y = count, fill = gender)
) +
  geom_bar(stat = "identity", position = "dodge")

plot of chunk unnamed-chunk-4

Age Data

Again, a totally fine way of doing the analysis…

# Age_utterance frequencies
y <- as.matrix(df$Age_utterance) # age of utterance as a matrix
counts2 <- table(y)              # number of rows per age category

# barplot() returns the x-coordinates of the bar midpoints. Keeping them lets
# the count labels be placed from the data rather than from hand-typed
# positions and values, which go stale as soon as the data change.
# NOTE(review): ylim is still fixed at 250; raise it if any count exceeds that.
bar_mids <- barplot(
  counts2,
  las = 2,
  ylim = c(0, 250),
  main = "Ages of Utterances"
)

# Write each bar's count just above its top (4 units higher). Every bar is
# labelled, including any that previously had no hand-placed label.
text(
  x = bar_mids,
  y = as.vector(counts2) + 4,
  labels = as.vector(counts2)
)

plot of chunk unnamed-chunk-5

Here’s another way to do it, using the same plyr stuff.

# One row per age category, with `count` = number of rows of df in it.
ages <- ddply(df, .(Age_utterance), summarise, count = length(Word))

# Bar per age category with its count printed 8 units above the bar top.
# The `stat` argument of qplot() has been deprecated since ggplot2 2.0.0, so
# the bar layer is built explicitly; stat = "identity" uses the precomputed
# counts as bar heights.
ggplot(ages, aes(x = Age_utterance, y = count)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = count, y = count + 8))

plot of chunk unnamed-chunk-6

We can do the same per-word breakdown by the age of the utterance, though it might slice the data a little thin.

# Count responses per (Word, Age_utterance) pair, then attach each word's
# overall total so that words can be ordered and filtered by it.
freqs <- ddply(df, .(Word, Age_utterance), summarise, count = length(Word))
freqs <- ddply(freqs, .(Word), mutate, total.count = sum(count))

# Re-level Word by decreasing total count (ties broken by Word); unique()
# collapses the repeats that arise from one row per age category.
freqs$Word <- factor(
  freqs$Word,
  levels = unique(with(
    freqs,
    Word[order(total.count, Word, decreasing = TRUE)]
  ))
)

# Side-by-side bars per age category for words with more than 5 occurrences
# overall, leaving out rows whose age is blank or "dontremember". The `stat`
# and `position` arguments of qplot() have been deprecated since ggplot2
# 2.0.0, so the layer is built explicitly with ggplot() + geom_bar().
ggplot(
  subset(
    freqs,
    total.count > 5 &
      Age_utterance != "" &
      Age_utterance != "dontremember"
  ),
  aes(x = Word, y = count, fill = Age_utterance)
) +
  geom_bar(stat = "identity", position = "dodge")

plot of chunk unnamed-chunk-7

Looks like we want proportions instead.

# Count responses per (Word, Age_utterance) pair and attach each word's
# overall total, as before.
freqs <- ddply(df, .(Word, Age_utterance), summarise, count = length(Word))
freqs <- ddply(freqs, .(Word), mutate, total.count = sum(count))

# Within each age category, convert counts to proportions of that category's
# total. This runs before any filtering, so the denominators include every
# word, not just the ones that end up in the plot.
freqs <- ddply(freqs, .(Age_utterance), mutate, prop = count / sum(count))

# Re-level Word by decreasing total count (ties broken by Word); unique()
# collapses the repeats that arise from one row per age category.
freqs$Word <- factor(
  freqs$Word,
  levels = unique(with(
    freqs,
    Word[order(total.count, Word, decreasing = TRUE)]
  ))
)

# Same plot as the count version, but with the within-age proportion as the
# bar height. The `stat` and `position` arguments of qplot() have been
# deprecated since ggplot2 2.0.0, so the layer is built explicitly with
# ggplot() + geom_bar().
ggplot(
  subset(
    freqs,
    total.count > 5 &
      Age_utterance != "" &
      Age_utterance != "dontremember"
  ),
  aes(x = Word, y = prop, fill = Age_utterance)
) +
  geom_bar(stat = "identity", position = "dodge")

plot of chunk unnamed-chunk-8

And the data are sparse, so we can instead split the ages into just two groups: utterances reported at 12 months or later versus everything else.

# Flag utterances reported at 12 months or later.
# NOTE(review): rows whose Age_utterance is "" or "dontremember" (excluded
# from the per-age plots) compare FALSE here and are therefore counted in the
# "not yearplus" group - confirm that this is intended.
df$yearplus <- df$Age_utterance == "12m" |
  df$Age_utterance == "13m" |
  df$Age_utterance == "14mOrLater"

# Count responses per (Word, yearplus) pair, attach each word's overall total,
# then convert counts to proportions within each yearplus group. The
# proportions are computed before the plot's filter, so the denominators
# include every word.
freqs <- ddply(df, .(Word, yearplus), summarise, count = length(Word))
freqs <- ddply(freqs, .(Word), mutate, total.count = sum(count))
freqs <- ddply(freqs, .(yearplus), mutate, prop = count / sum(count))

# Re-level Word by decreasing total count (ties broken by Word); unique()
# collapses the repeats that arise from one row per yearplus group.
freqs$Word <- factor(
  freqs$Word,
  levels = unique(with(
    freqs,
    Word[order(total.count, Word, decreasing = TRUE)]
  ))
)

# Side-by-side bars per yearplus group for words with more than 5 occurrences
# overall. The `stat` and `position` arguments of qplot() have been deprecated
# since ggplot2 2.0.0, so the layer is built explicitly with
# ggplot() + geom_bar().
ggplot(
  subset(freqs, total.count > 5),
  aes(x = Word, y = prop, fill = yearplus)
) +
  geom_bar(stat = "identity", position = "dodge")

plot of chunk unnamed-chunk-9

Subject Breakdown

# Subject frequencies: tabulate the Subject column and draw one bar per
# distinct value, with the axis labels turned perpendicular (las = 2).
z <- as.matrix(df$Subject)
counts3 <- table(z)
barplot(
  height = counts3,
  las = 2,
  ylim = c(0, 300),
  main = "Frequency of Subjects"
)

plot of chunk unnamed-chunk-10