Loading the data
# Read a text file into a character vector with one element per line.
#
# The file is read through a connection opened in binary mode ("rb") instead
# of passing the path straight to readLines(). In text mode some platforms
# (notably Windows) treat an embedded control character such as 0x1A as
# end-of-file, which silently truncates the input. readLines() accepts LF,
# CRLF and CR line endings regardless of the connection mode.
#
# NOTE(review): the previous text-mode read of en_US.news.txt warned
# "incomplete final line found on 'en_US.news.txt'" and returned only 77259
# lines for a ~196 MB file, which looks like this kind of truncation --
# TODO confirm by re-running and refreshing the recorded outputs further down.
#
# path: path to a UTF-8 encoded text file.
# Returns: character vector of lines, with embedded NULs skipped.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  # Close the connection even if readLines() fails part-way through.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus("en_US.blogs.txt")
news <- read_corpus("en_US.news.txt")
twitter <- read_corpus("en_US.twitter.txt")
Line Count
# Number of lines read from each corpus (one vector element per line).
# NROW() on a plain character vector is its length.
NROW(blogs)
## [1] 899288
# NOTE(review): this count is far below the blogs count even though the two
# files are of similar size on disk -- TODO confirm the news file was read
# completely.
NROW(news)
## [1] 77259
NROW(twitter)
## [1] 2360148
File Sizes (in MiB)
# On-disk size of each corpus file, converted from bytes to MiB
# (2^20 = 1024^2 bytes per MiB).
file.size("en_US.blogs.txt") / 2^20
## [1] 200.4242
file.size("en_US.news.txt") / 2^20
## [1] 196.2775
file.size("en_US.twitter.txt") / 2^20
## [1] 159.3641
Word Count of Each Data Set
# Count whitespace-delimited tokens across all elements of a character vector.
#
# Each element is split on runs of whitespace and the per-element token counts
# are summed. lengths() replaces sapply(..., length): it is vectorised and
# always returns an integer vector, so zero-length input yields 0 instead of
# an error from sum() on an empty list.
#
# Note: the splitting rule is unchanged, so a line that starts with whitespace
# still contributes one empty leading token to the total.
#
# lines: character vector, one element per line of text.
# Returns: a single integer, the total token count.
count_words <- function(lines) {
  sum(lengths(strsplit(lines, "\\s+")))
}

blogs_words <- count_words(blogs)
blogs_words
## [1] 37334131
news_words <- count_words(news)
news_words
## [1] 2643969
twitter_words <- count_words(twitter)
twitter_words
## [1] 30373583
Blog dataset histogram
# Distribution of blog line lengths, measured in characters.
# `blogs` is already loaded at the top of the script with the same readLines()
# arguments, so the ~200 MB file is not read a second time here.
blogs_length <- nchar(blogs)
hist(
  blogs_length,
  main = "Blog Text Length Distribution",
  xlab = "Character Count"
)
News dataset histogram
# Distribution of news line lengths, measured in characters.
news_length <- nchar(news)
# NOTE(review): ylim caps the frequency axis at 5000; any bin holding more
# lines than that is drawn clipped at the top of the plot -- TODO confirm the
# cap is intentional.
hist(
  x = news_length,
  breaks = 50,
  col = "lightgreen",
  ylim = c(0, 5000),
  main = "News Line Length Distribution",
  xlab = "Characters per line"
)
Twitter dataset histogram
# Distribution of tweet lengths, measured in characters.
twitter_length <- nchar(twitter)
# The frequency axis is left to hist() to scale. The earlier fixed cap of
# ylim = c(0, 5000) was far below the bar heights: 2360148 tweets spread over
# roughly 50 bins average tens of thousands per bin, so nearly every bar was
# clipped at the top of the plot and the shape of the distribution was lost.
# NOTE(review): xlim still restricts the view to 0-100 characters, hiding any
# longer lines -- TODO confirm that cut-off is intentional.
hist(
  twitter_length,
  main = "Twitter Line Length Distribution",
  xlab = "Characters per line",
  col = "lightpink",
  breaks = 50,
  xlim = c(0, 100)
)