Introduction

This report presents an exploratory analysis of the English-language blogs, news, and Twitter corpora used to build a next-word prediction algorithm. It summarizes the size of each file in lines, words, and megabytes, and examines the distribution of words per blog entry.

library(stringi)
library(ggplot2)

# Read each corpus as UTF-8, skipping embedded nulls
blogs   <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news    <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
# Basic size statistics: line counts, word counts, and file size in megabytes
summary_df <- data.frame(
  Dataset = c("Blogs", "News", "Twitter"),
  Lines = c(length(blogs), length(news), length(twitter)),
  Words = c(
    sum(stri_count_words(blogs)),
    sum(stri_count_words(news)),
    sum(stri_count_words(twitter))
  ),
  FileSize_MB = c(
    file.size("en_US.blogs.txt"),
    file.size("en_US.news.txt"),
    file.size("en_US.twitter.txt")
  ) / 1024^2
)
summary_df
##   Dataset   Lines    Words FileSize_MB
## 1   Blogs  899288 37546250    200.4242
## 2    News 1010242 34762395    196.2775
## 3 Twitter 2360148 30093413    159.3641
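
The Twitter corpus has by far the most lines but the fewest total words, while the blogs corpus shows the opposite pattern. Dividing the two columns of the table above gives roughly 42 words per line for blogs, 34 for news, and 13 for Twitter; a quick derived column makes this explicit:

# Average words per line, derived from the counts already in summary_df
summary_df$WordsPerLine <- round(summary_df$Words / summary_df$Lines, 1)
summary_df[, c("Dataset", "WordsPerLine")]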
# Words per line for the blogs corpus
blog_words <- stri_count_words(blogs)

# Histogram of words per blog entry (ggplot() used in place of the deprecated qplot())
ggplot(data.frame(words = blog_words), aes(x = words)) +
  geom_histogram(bins = 50) +
  labs(
    title = "Distribution of Words per Blog Entry",
    x = "Words per Line",
    y = "Frequency"
  )
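
To complement the histogram with a numeric view of the same distribution, the quartiles of the word counts can be printed directly, reusing the blog_words vector computed above:

# Five-number summary of words per blog entry
summary(blog_words)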