The purpose of this brief report is to demonstrate early progress on
the text prediction project.
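The goals are to show that the data have been loaded successfully, to summarize the three datasets, and to present initial exploratory findings.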
This report is written for a non-technical manager and focuses on major insights rather than coding details.
# Read one corpus file into a character vector, one element per line.
#
# The connection is opened in binary mode ("rb") so the result does not depend
# on the platform's text-mode handling: a text-mode read on Windows can stop
# early at a Ctrl-Z (0x1A) byte and silently return a truncated corpus.
# readLines() accepts LF, CRLF and CR line endings in either mode.
# NOTE(review): embedded NUL bytes still end the affected line, and
# warn = FALSE hides the warning -- consider skipNul = TRUE if that text
# matters (doing so may slightly change the word counts reported below).
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, warn = FALSE, encoding = "UTF-8")
}

# Load the three English-language corpora (paths relative to the working dir).
corpus_dir <- file.path("final", "en_US")
blogs <- read_corpus(file.path(corpus_dir, "en_US.blogs.txt"))
news <- read_corpus(file.path(corpus_dir, "en_US.news.txt"))
twitter <- read_corpus(file.path(corpus_dir, "en_US.twitter.txt"))
library(stringi)

# Overview table with one row per corpus: number of lines, total word count
# (via stringi's word counter) and the length in characters of the longest
# line. Built inside local() so the helper names do not leak into the
# global environment.
data_summary <- local({
  corpora <- list(blogs, news, twitter)

  # Apply one statistic to every corpus and flatten to a plain vector.
  per_corpus <- function(stat) unlist(lapply(corpora, stat))

  data.frame(
    Dataset = c("Blogs", "News", "Twitter"),
    Lines = per_corpus(length),
    TotalWords = per_corpus(function(txt) sum(stri_count_words(txt))),
    LargestLine = per_corpus(function(txt) max(nchar(txt)))
  )
})

# Print the table in the report.
data_summary
## Dataset Lines TotalWords LargestLine
## 1 Blogs 899288 37546250 40833
## 2 News 1010242 34762395 11384
## 3 Twitter 2360148 30093372 140
library(ggplot2)

# Histogram of words per line for a random sample of blog lines.
#
# Fix the RNG seed so the sampled lines -- and therefore the figure -- are
# identical every time the report is rebuilt.
set.seed(1234)

# Never request more lines than the corpus holds; sample() without
# replacement errors when the requested size exceeds the population.
sample_size <- min(5000, length(blogs))
sample_blogs <- sample(blogs, sample_size)
sample_words <- stri_count_words(sample_blogs)

# Each observation is one line of the blogs file, so the title says
# "line" to agree with the x-axis label.
ggplot(data.frame(words = sample_words), aes(x = words)) +
  geom_histogram(binwidth = 5, fill = "steelblue") +
  labs(
    title = "Distribution of Word Counts per Blog Line",
    x = "Words per line",
    y = "Frequency"
  )