Fixes for Data Science Capstone Project: Milestone 1

Line Count for Twitter, News, and Blogs Datasets

Line Count for Twitter Dataset

# Run `wc -l` on the Twitter corpus; intern = TRUE returns the command's
# output to R as a character vector instead of only echoing it.
system(
  command = "wc -l final/en_US/en_US.twitter.txt",
  intern = TRUE
)
## [1] " 2360148 final/en_US/en_US.twitter.txt"

Line Count for News Dataset

# Run `wc -l` on the News corpus; intern = TRUE returns the command's
# output to R as a character vector instead of only echoing it.
system(
  command = "wc -l final/en_US/en_US.news.txt",
  intern = TRUE
)
## [1] " 1010242 final/en_US/en_US.news.txt"

Line Count for Blogs Dataset

# Run `wc -l` on the Blogs corpus; intern = TRUE returns the command's
# output to R as a character vector instead of only echoing it.
system(
  command = "wc -l final/en_US/en_US.blogs.txt",
  intern = TRUE
)
## [1] "  899288 final/en_US/en_US.blogs.txt"

ggplots for the Top 20 Single, Double, and Triple Words

Top 20 Single Words

library(ggplot2)

# Restore the saved single-word frequency table. The plot below expects the
# file to provide a data frame named `one.gram.20` with `Word` and
# `Frequency` columns (NOTE(review): confirm against the script that saved it).
load("one.gram.20.Rdata")

# Bar chart of single-word frequencies, each bar labelled with its count.
# - reorder() sorts the bars by descending Frequency so the ranking is
#   readable; labs() restores the plain "Word" axis title that reorder()
#   would otherwise replace with the expression text.
# - The x labels are rotated so 20 category labels do not overlap.
# - The stray empty argument after aes() was dropped and the stat name is
#   written in its canonical lower-case form.
ggplot(one.gram.20, aes(x = reorder(Word, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity", fill = "blue") +
  geom_text(aes(label = Frequency), vjust = -0.2) +
  labs(x = "Word") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

plot of chunk unnamed-chunk-3

Top 20 Double Words

# Restore the saved two-word frequency table. The plot below expects the
# file to provide a data frame named `two.gram.20` with `Word` and
# `Frequency` columns (NOTE(review): confirm against the script that saved it).
load("two.gram.20.Rdata")

# Bar chart of two-word frequencies, each bar labelled with its count.
# - reorder() sorts the bars by descending Frequency so the ranking is
#   readable; labs() restores the plain "Word" axis title that reorder()
#   would otherwise replace with the expression text.
# - The x labels are rotated so multi-word labels do not overlap.
# - The stray empty argument after aes() was dropped and the stat name is
#   written in its canonical lower-case form.
ggplot(two.gram.20, aes(x = reorder(Word, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity", fill = "light blue") +
  geom_text(aes(label = Frequency), vjust = -0.2) +
  labs(x = "Word") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

plot of chunk unnamed-chunk-4

Top 20 Triple Words

# Restore the saved three-word frequency table. The plot below expects the
# file to provide a data frame named `three.gram.20` with `Word` and
# `Frequency` columns (NOTE(review): confirm against the script that saved it).
load("three.gram.20.Rdata")

# Bar chart of three-word frequencies, each bar labelled with its count.
# - reorder() sorts the bars by descending Frequency so the ranking is
#   readable; labs() restores the plain "Word" axis title that reorder()
#   would otherwise replace with the expression text.
# - The x labels are rotated so multi-word labels do not overlap.
# - The stray empty argument after aes() was dropped and the stat name is
#   written in its canonical lower-case form.
ggplot(three.gram.20, aes(x = reorder(Word, -Frequency), y = Frequency)) +
  geom_bar(stat = "identity", fill = "green") +
  geom_text(aes(label = Frequency), vjust = -0.2) +
  labs(x = "Word") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

plot of chunk unnamed-chunk-5