# Load the three en_US corpus files; each becomes a character vector with one
# element per line of the file.
#
# read_corpus() reads a whole text file through an explicit connection that is
# always closed again, even if reading fails.
#   path: path to the text file to read.
#   returns: character vector, one element per line.
# Options passed to readLines():
#   warn = FALSE       - silences the "incomplete final line" warning raised
#                        when a file has no trailing newline (the last line is
#                        still returned).
#   skipNul = TRUE     - drops embedded NUL bytes instead of letting them cut
#                        a line short.
#   encoding = "UTF-8" - marks the returned strings as UTF-8.
# NOTE(review): the files are assumed to be UTF-8 encoded -- confirm.
# NOTE(review): the connection is opened in binary mode ("rb") so that
# platform-specific text-mode handling of stray control characters cannot end
# the read early -- confirm this matters on the target platform.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus("en_US.blogs.txt")
news <- read_corpus("en_US.news.txt")
twitter <- read_corpus("en_US.twitter.txt")
# Line counts per dataset, one row per corpus.
# The result keeps the name `summary` (which is also the name of a base R
# function) because the plotting code refers to it by that name.
summary <- local({
  corpora <- list(Blogs = blogs, News = news, Twitter = twitter)
  data.frame(
    Dataset = names(corpora),
    # use.names = FALSE keeps the default 1..n row names
    Lines = lengths(corpora, use.names = FALSE)
  )
})
summary
## Dataset Lines
## 1 Blogs 5
## 2 News 5
## 3 Twitter 5
# Bar chart of the number of lines in each dataset.
# ggplot2 must be attached for ggplot()/aes()/geom_col()/ggtitle(); calling
# library() again is harmless if it was already attached earlier.
library(ggplot2)

# geom_col() draws bar heights directly from the y aesthetic (Lines); it is the
# idiomatic replacement for geom_bar(stat = "identity").
ggplot(summary, aes(x = Dataset, y = Lines)) +
  geom_col() +
  ggtitle("Number of Lines")

Findings
- The Twitter dataset contains short messages.
- The Blogs dataset contains longer text.
- The News dataset contains formal language.
Future Work
- Clean the text.
- Remove punctuation.
- Build n-gram models.
- Develop a Shiny application.