This report presents an exploratory analysis of text data used to build a next-word prediction algorithm.
library(stringi)
library(ggplot2)
# Read one corpus file into a character vector (one element per line).
#
# The file is read through an explicitly opened *binary* connection rather
# than by passing the path straight to readLines(). In text mode, Windows
# treats an embedded 0x1A (Ctrl-Z) control character as end-of-file, so a
# corpus containing one would be truncated without an error. Binary mode
# reads to the real end of the file on every platform; readLines() still
# accepts LF, CRLF and CR line endings in this mode.
#
# path: path to a UTF-8 encoded text file.
# Returns a character vector whose strings are marked as UTF-8; embedded
# NUL bytes are skipped.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  # Release the connection even if readLines() fails part-way through.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus("en_US.blogs.txt")
news <- read_corpus("en_US.news.txt")
twitter <- read_corpus("en_US.twitter.txt")
# Per-corpus overview table: number of lines, total word count and size on
# disk. Each column is computed with one vectorised call over the three
# corpora (always in the order Blogs, News, Twitter) instead of spelling
# out one call per corpus.
summary_df <- data.frame(
  Dataset = c("Blogs", "News", "Twitter"),
  # Element count of each character vector, i.e. lines read per file.
  Lines = lengths(list(blogs, news, twitter)),
  # Total words per corpus: sum of the per-line word counts.
  Words = vapply(
    list(blogs, news, twitter),
    function(txt) sum(stri_count_words(txt)),
    FUN.VALUE = integer(1)
  ),
  # file.size() reports bytes; dividing by 1024^2 converts to megabytes.
  FileSize_MB = file.size(
    c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
  ) / 1024^2
)

# Display the table.
summary_df
##   Dataset   Lines    Words FileSize_MB
## 1   Blogs  899288 37546250    200.4242
## 2    News 1010242 34762395    196.2775
## 3 Twitter 2360148 30093413    159.3641
# Number of words in each blog entry (one count per line of `blogs`).
blog_words <- stri_count_words(blogs)

# Histogram of entry lengths, 50 bins.
# Built with ggplot() + geom_histogram() because qplot() is deprecated in
# current ggplot2 releases; the layer, bin count, title and axis labels are
# the same as the qplot() call this replaces. ggplot() needs a data frame,
# so the counts are wrapped in a single-column one.
ggplot(data.frame(words = blog_words), aes(x = words)) +
  geom_histogram(bins = 50) +
  labs(
    title = "Distribution of Words per Blog Entry",
    x = "Words per Line",
    y = "Frequency"
  )