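This report explores three English-language text datasets (blogs, news, and Twitter): it tabulates the number of lines and words in each file and plots the distribution of words per line for a sample of the blogs data.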
# Read one corpus file into a character vector, one element per line.
#
# The file is read through an explicit binary-mode connection instead of
# handing the path straight to readLines().
# NOTE(review): a text-mode read on Windows can stop early at an embedded
# Ctrl-Z (0x1A) byte, which would silently shrink the line counts -- confirm
# whether these corpora contain one. readLines() accepts LF, CRLF and CR as
# line terminators in either mode, so the lines returned are otherwise the
# same.
#
# path: path to the text file to read.
# Returns a character vector with embedded NULs skipped and the strings
# marked as UTF-8.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  # Release the connection even if readLines() fails part-way through.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus("en_US.blogs.txt")
news <- read_corpus("en_US.news.txt")
twitter <- read_corpus("en_US.twitter.txt")
# Total number of space-separated tokens across all lines of a corpus.
#
# lengths() always yields an integer vector, so an empty corpus sums to 0;
# sapply(..., length) returns an empty list for zero-length input, which
# sum() rejects with an error.
#
# NOTE(review): the split is kept on a single literal space so totals stay
# comparable with earlier runs. Consecutive spaces or a leading space yield
# empty tokens that are counted as words, and tabs are not separators --
# decide whether a whitespace-run split is wanted instead.
#
# lines: character vector, one element per line of text.
# Returns a single integer count.
count_words <- function(lines) {
  sum(lengths(strsplit(lines, " ")))
}

blogs_words <- count_words(blogs)
news_words <- count_words(news)
twitter_words <- count_words(twitter)
# Assemble the per-corpus overview: one row per source, giving the number of
# lines read and the total word count computed earlier in the script.
summary_data <- local({
  sources <- list(Blogs = blogs, News = news, Twitter = twitter)
  data.frame(
    File = names(sources),
    # use.names = FALSE keeps the default 1..n row names on the data frame.
    Lines = lengths(sources, use.names = FALSE),
    Words = c(blogs_words, news_words, twitter_words)
  )
})

# Bare name so the table is auto-printed.
summary_data
## File Lines Words
## 1 Blogs 899288 37334131
## 2 News 1010206 34371031
## 3 Twitter 2360148 30373583
# Distribution of words per line over (at most) the first 1000 blog lines.
#
# head() is used instead of blogs[1:1000] so that a corpus shorter than 1000
# lines is not padded with NA entries; each NA would be counted as a
# one-word line and distort the histogram.
blog_sample <- head(blogs, 1000L)

# Tokens per line, split on a single space. lengths() returns an integer
# vector even when the sample is empty.
blog_words <- lengths(strsplit(blog_sample, " "))

# hist() cannot draw zero observations, so skip the plot for an empty sample.
if (length(blog_words) > 0L) {
  hist(
    blog_words,
    main = "Word Count Distribution (Blogs Sample)",
    xlab = "Words per line"
  )
} else {
  message("No blog lines available; skipping word-count histogram.")
}