Preliminaries:
library(plyr)
library(ggplot2)
# Use the black-and-white ggplot2 theme for every ggplot drawn below.
theme_set(theme_bw())

# Load the survey responses. Later sections read the Word, gender,
# Age_utterance and Subject columns of this data frame.
df <- read.csv("data/CDMlangsurvey_analysis.csv")

# Tabulate how many rows report each word.
x <- as.matrix(df$Word)
counts <- table(x)

# Base-graphics text size. par() is deliberately left un-restored here, so
# these settings also apply to the base barplots drawn later in the script.
par(ps = 8, cex = 1, cex.main = 1)

# Bar chart of the words reported more than 5 times (a word with exactly
# 5 occurrences is left out); las = 2 turns the axis labels perpendicular.
frequent <- counts > 5
barplot(
  counts[frequent],
  las = 2,
  main = "Frequency of Words",
  xlab = "Words"
)
This is a totally reasonable way of doing the computation. Here’s an alternative in another idiom, using ddply with summarise and qplot. (Note that I also sorted the levels so that they aren’t in alphabetical order – the syntax for this is annoying, but it’s worth it.)
## an alternate way of doing this
# One row per distinct Word, with `count` = number of survey rows for it.
freqs <- ddply(df, .(Word), summarise, count = length(Word))

# Re-level Word by descending count (ties broken by Word, also descending)
# so the bars are drawn most-frequent first instead of alphabetically.
freqs$Word <- factor(
  freqs$Word,
  levels = with(freqs, Word[order(count, Word, decreasing = TRUE)])
)

# Plot words seen more than 5 times. This is the ggplot() spelling of
# qplot(Word, count, geom = "bar", stat = "identity", ...): qplot()'s `stat`
# argument is deprecated and ignored by ggplot2 >= 2.0, which makes the
# qplot() form fail because a bar geom with a y aesthetic needs the
# identity stat.
ggplot(subset(freqs, count > 5), aes(x = Word, y = count)) +
  geom_bar(stat = "identity")
Now, here’s why you’d do it that way; the same code can be broken down by other factors. Though note that sorting the levels got a bunch more annoying here, and I had to add an extra “total count” variable.
# Count rows per (Word, gender) pair, then attach each word's overall total
# so words can be ordered and filtered by their total frequency.
freqs <- ddply(df, .(Word, gender), summarise, count = length(Word))
freqs <- ddply(freqs, .(Word), mutate, total.count = sum(count))

# Order the Word levels by descending total count. Each word now appears
# once per gender, hence the unique() around the ordered vector.
freqs$Word <- factor(
  freqs$Word,
  levels = unique(with(
    freqs,
    Word[order(total.count, Word, decreasing = TRUE)]
  ))
)

# Side-by-side bars per gender for words with more than 5 occurrences in
# total, dropping rows where gender is blank. Written with ggplot() because
# qplot()'s `stat` and `position` arguments are deprecated and ignored by
# ggplot2 >= 2.0.
ggplot(
  subset(freqs, total.count > 5 & gender != ""),
  aes(x = Word, y = count, fill = gender)
) +
  geom_bar(stat = "identity", position = "dodge")
Again, a totally fine way of doing the analysis…
#Age_utterance Frequencies
# Tabulate how many rows fall into each age-of-utterance category.
y <- as.matrix(df$Age_utterance)
counts2 <- table(y)

# barplot() returns the x midpoint of every bar; keep it so the labels can be
# positioned from the data instead of from hand-measured coordinates.
bar_mids <- barplot(counts2, las = 2, ylim = c(0, 250),
                    main = "Ages of Utterances")

# Print each bar's count just above it (4 units above the bar top, the same
# offset the hand-placed labels used). The labels now follow the data if the
# counts or the number of categories change. Every bar is labelled.
text(bar_mids, as.vector(counts2) + 4, labels = as.vector(counts2))
Here’s another way to do it, using the same plyr stuff.
# One row per age-of-utterance category with its number of survey rows.
ages <- ddply(df, .(Age_utterance), summarise, count = length(Word))

# Bar per category with the count printed 8 units above the bar top.
# Written with ggplot() because qplot()'s `stat` argument is deprecated and
# ignored by ggplot2 >= 2.0; geom_text() inherits x from the plot mapping.
ggplot(ages, aes(x = Age_utterance, y = count)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = count, y = count + 8))
We can do the same histogram thing with age of the utterance… might slice the data a little thin.
# Count rows per (Word, Age_utterance) pair and attach each word's overall
# total for ordering and filtering.
freqs <- ddply(df, .(Word, Age_utterance), summarise, count = length(Word))
freqs <- ddply(freqs, .(Word), mutate, total.count = sum(count))

# Order the Word levels by descending total count; unique() collapses the
# repeats that come from having one row per age category.
freqs$Word <- factor(
  freqs$Word,
  levels = unique(with(
    freqs,
    Word[order(total.count, Word, decreasing = TRUE)]
  ))
)

# Dodged bars per age category for words with more than 5 occurrences in
# total, leaving out blank and "dontremember" ages. Written with ggplot()
# because qplot()'s `stat` and `position` arguments are deprecated and
# ignored by ggplot2 >= 2.0.
ggplot(
  subset(freqs, total.count > 5 &
           Age_utterance != "" &
           Age_utterance != "dontremember"),
  aes(x = Word, y = count, fill = Age_utterance)
) +
  geom_bar(stat = "identity", position = "dodge")
Looks like we want proportions instead.
# Count rows per (Word, Age_utterance) pair and attach each word's overall
# total for ordering and filtering.
freqs <- ddply(df, .(Word, Age_utterance), summarise, count = length(Word))
freqs <- ddply(freqs, .(Word), mutate, total.count = sum(count))

# Within each age category, convert counts to the share of that category's
# rows, so categories with different numbers of responses are comparable.
freqs <- ddply(freqs, .(Age_utterance), mutate, prop = count / sum(count))

# Order the Word levels by descending total count; unique() collapses the
# repeats that come from having one row per age category.
freqs$Word <- factor(
  freqs$Word,
  levels = unique(with(
    freqs,
    Word[order(total.count, Word, decreasing = TRUE)]
  ))
)

# Same chart as the count version but with proportions on the y axis.
# The proportions were computed before this filter, so they are shares of
# all words in the age category, not only of the words shown. Written with
# ggplot() because qplot()'s `stat` and `position` arguments are deprecated
# and ignored by ggplot2 >= 2.0.
ggplot(
  subset(freqs, total.count > 5 &
           Age_utterance != "" &
           Age_utterance != "dontremember"),
  aes(x = Word, y = prop, fill = Age_utterance)
) +
  geom_bar(stat = "identity", position = "dodge")
And the data are sparse, so we can do an age split.
# Flag utterances reported at 12 months or later.
# NOTE(review): every other value, including blank and "dontremember" ages
# (which the earlier per-age plots filter out), ends up FALSE here and is
# therefore counted in the under-a-year group. Confirm that is intended.
df$yearplus <- df$Age_utterance == "12m" |
  df$Age_utterance == "13m" |
  df$Age_utterance == "14mOrLater"

# Count rows per (Word, yearplus) pair, attach each word's overall total,
# and convert counts to the share of rows within each yearplus group.
freqs <- ddply(df, .(Word, yearplus), summarise, count = length(Word))
freqs <- ddply(freqs, .(Word), mutate, total.count = sum(count))
freqs <- ddply(freqs, .(yearplus), mutate, prop = count / sum(count))

# Order the Word levels by descending total count; unique() collapses the
# repeats that come from having one row per yearplus value.
freqs$Word <- factor(
  freqs$Word,
  levels = unique(with(
    freqs,
    Word[order(total.count, Word, decreasing = TRUE)]
  ))
)

# Dodged proportion bars for words with more than 5 occurrences in total.
# Written with ggplot() because qplot()'s `stat` and `position` arguments
# are deprecated and ignored by ggplot2 >= 2.0.
ggplot(
  subset(freqs, total.count > 5),
  aes(x = Word, y = prop, fill = yearplus)
) +
  geom_bar(stat = "identity", position = "dodge")
#Subject frequencies
# Tabulate how many rows each Subject value has.
z <- as.matrix(df$Subject)
counts3 <- table(z)

# Bar chart of those counts with perpendicular axis labels and a fixed
# 0-300 y range.
barplot(
  height = counts3,
  main = "Frequency of Subjects",
  ylim = c(0, 300),
  las = 2
)