This milestone report presents an exploratory analysis of the English text datasets (blogs, news, and Twitter) used for building a next-word prediction model and Shiny application. The objective is to understand each dataset's size, structure, and word distribution.
# Read one corpus file into a character vector, one element per line.
#
# Why not plain readLines(path, warn = FALSE)?
# * With warn = FALSE an embedded nul byte still terminates the line being
#   read, but the warning is hidden, so text is dropped silently.
#   skipNul = TRUE removes the nul bytes and keeps the rest of the line.
# * Reading by file name uses a text-mode connection. Opening the connection
#   in binary mode ("rb") avoids platform-specific text-mode handling of
#   control bytes (e.g. Ctrl-Z being treated as end-of-file on Windows).
#
# path: path to a UTF-8 encoded text file.
# Returns a character vector with the strings marked as UTF-8.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus("en_US.blogs.txt")
news <- read_corpus("en_US.news.txt")
twitter <- read_corpus("en_US.twitter.txt")
# Per-corpus summary table: number of lines read and total word count.
# stri_count_words is namespace-qualified so this chunk runs even when
# library(stringi) has not been attached earlier in the session.
corpora <- list(Blogs = blogs, News = news, Twitter = twitter)

stats <- data.frame(
  File = names(corpora),
  # use.names = FALSE keeps the default 1..n row names in the printed table
  Lines = lengths(corpora, use.names = FALSE),
  Words = vapply(
    corpora,
    function(lines) sum(stringi::stri_count_words(lines)),
    integer(1),
    USE.NAMES = FALSE
  )
)
stats
## File Lines Words
## 1 Blogs 899288 37546806
## 2 News 1010206 34761151
## 3 Twitter 2360148 30096649
# On-disk size of each corpus file, reported in megabytes (bytes / 1024^2)
# and rounded to two decimal places.
corpus_paths <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
corpus_labels <- c("Blogs", "News", "Twitter")
bytes_per_mb <- 1024^2

sizes_mb <- round(file.size(corpus_paths) / bytes_per_mb, 2)
data.frame(File = corpus_labels, Size_MB = sizes_mb)
## File Size_MB
## 1 Blogs 200.42
## 2 News 196.28
## 3 Twitter 159.36
# Histogram of words per line for a random sample of blog lines.
# At most 1000 lines are sampled; if fewer lines were read, all of them are
# used. stri_count_words is namespace-qualified so the chunk does not rely on
# library(stringi) having been attached earlier.
set.seed(123) # fixed seed so the same lines are drawn on every run
sample_size <- min(1000, length(blogs))
sampled_lines <- sample(blogs, sample_size)
sample_words <- stringi::stri_count_words(sampled_lines)

hist(
  sample_words,
  main = "Words per Line Distribution (Blogs Sample)",
  xlab = "Words per line"
)
Next steps: