Loading the data

# Read a text file into a character vector, one element per line.
#
# The file is opened as a binary-mode connection ("rb") instead of letting
# readLines() open it in text mode. The earlier text-mode read of
# "en_US.news.txt" emitted "incomplete final line found" and returned far
# fewer lines than the similarly sized blogs file, which suggests the read
# stopped early -- presumably at an embedded control character that a
# text-mode read treats as end-of-file on some platforms (TODO confirm).
# A binary-mode connection reads through to the real end of the file.
#
# path: path of the file to read.
# Returns a character vector; strings are marked as UTF-8 and embedded
# NUL bytes are skipped.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  # Release the connection even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus("en_US.blogs.txt")
news <- read_corpus("en_US.news.txt")
twitter <- read_corpus("en_US.twitter.txt")
# NOTE(review): the recorded outputs further down (line counts, word counts)
# were produced by the earlier text-mode read; re-run the document to
# refresh them.

Line Count

# Number of lines read from each corpus: readLines() returns one vector
# element per line, so length() is the line count.
length(blogs)
## [1] 899288
# NOTE(review): news reports far fewer lines than blogs even though the two
# files are of similar size on disk -- TODO confirm the news file was read
# completely.
length(news)
## [1] 77259
length(twitter)
## [1] 2360148

File sizes (in MiB)

# On-disk size of each corpus file, converted from bytes to mebibytes
# (2^20 bytes = 1024^2 bytes).
file.size("en_US.blogs.txt") / 2^20
## [1] 200.4242
file.size("en_US.news.txt") / 2^20
## [1] 196.2775
file.size("en_US.twitter.txt") / 2^20
## [1] 159.3641

Word count of each data set

# Count whitespace-delimited tokens across all lines of a corpus.
#
# Each line is split on runs of whitespace and the per-line token counts are
# summed. lengths() returns the per-line counts directly as an integer
# vector, which is type-stable and avoids calling length() once per line
# through sapply().
#
# lines: character vector, one element per line.
# Returns the total token count as a single number.
# Note: a line that starts with whitespace yields a leading empty token,
# which is included in the count.
count_words <- function(lines) {
  sum(lengths(strsplit(lines, "\\s+")))
}

blogs_words <- count_words(blogs)
blogs_words
## [1] 37334131
news_words <- count_words(news)
news_words
## [1] 2643969
twitter_words <- count_words(twitter)
twitter_words
## [1] 30373583

Blog dataset histogram

# `blogs` is already in memory from the loading step at the top of the
# document, so the file is not read from disk a second time here.

# Characters per line in the blogs corpus.
blogs_length <- nchar(blogs)

# Histogram of blog line lengths with hist()'s default binning and axes.
hist(blogs_length,
     main = "Blog Text Length Distribution",
     xlab = "Character Count")

News dataset histogram

# Characters per line in the news corpus.
news_length <- nchar(news)

# Histogram of news line lengths: roughly 50 bins requested, with the
# y-axis capped at 5000.
hist(
  x = news_length,
  breaks = 50,
  col = "lightgreen",
  main = "News Line Length Distribution",
  xlab = "Characters per line",
  ylim = c(0, 5000)
)

Twitter dataset histogram

# Characters per line in the Twitter corpus.
twitter_length <- nchar(twitter)

# Histogram of Twitter line lengths, roughly 50 bins requested.
# The axis limits are left for hist() to derive from the data. The previous
# hard-coded ylim of c(0, 5000) was far below the bar heights to be expected
# when more than two million lines are spread over about 50 bins, so the
# bars were clipped at the top of the plot; the hard-coded xlim of c(0, 100)
# hid any lines longer than 100 characters.
hist(twitter_length,
     main = "Twitter Line Length Distribution",
     xlab = "Characters per line",
     col = "lightpink",
     breaks = 50)