This project explores text data from blogs, news, and Twitter.
The goal is to build a next-word prediction model.
# Fix the random number generator seed so that any randomized step executed
# after this point gives the same result on every run.
set.seed(123)
#' Read the first `n` lines of a text file
#'
#' Opens `file` in text-read mode, reads at most `n` lines (embedded NUL
#' bytes are skipped and the strings are marked as UTF-8), and closes the
#' connection again.
#'
#' @param file Path to the text file to read.
#' @param n Maximum number of lines to read. Defaults to 10000.
#' @return A character vector with one element per line read.
read_sample <- function(file, n = 10000) {
  con <- file(file, "r")
  # Register the cleanup immediately so the connection is released even when
  # readLines() signals an error; otherwise the open handle would leak.
  on.exit(close(con), add = TRUE)
  readLines(con, n = n, encoding = "UTF-8", skipNul = TRUE)
}
# Load up to 10,000 lines from each of the three English corpus files.
blogs <- read_sample(file = "en_US.blogs.txt", n = 10000)
news <- read_sample(file = "en_US.news.txt", n = 10000)
twitter <- read_sample(file = "en_US.twitter.txt", n = 10000)
# Summarise each sample: number of lines, total words, and mean words per line.
# The per-line word counts are computed once per corpus and reused for both
# the totals and the averages; local() keeps the temporary list out of the
# workspace so only `summary_table` is created.
summary_table <- local({
  counts <- list(
    stri_count_words(blogs),
    stri_count_words(news),
    stri_count_words(twitter)
  )
  data.frame(
    Dataset = c("Blogs", "News", "Twitter"),
    Lines = c(length(blogs), length(news), length(twitter)),
    Words = vapply(counts, sum, integer(1)),
    AvgWords = vapply(counts, mean, numeric(1))
  )
})
# Render the summary as a table in the knitted report.
knitr::kable(summary_table)
| Dataset | Lines | Words | AvgWords |
|---|---|---|---|
| Blogs | 10000 | 413215 | 41.3215 |
| News | 10000 | 349062 | 34.9062 |
| Twitter | 10000 | 126736 | 12.6736 |
# Words per line in the blogs sample, drawn as a histogram with 10-word bins.
blogs_wc <- stri_count_words(blogs)
ggplot(data = data.frame(x = blogs_wc), mapping = aes(x = x)) +
  geom_histogram(fill = "steelblue", binwidth = 10) +
  labs(
    title = "Blogs Word Distribution",
    x = "Words",
    y = "Frequency"
  )
# Words per line in the news sample, drawn as a histogram with 10-word bins.
news_wc <- stri_count_words(news)
ggplot(data = data.frame(x = news_wc), mapping = aes(x = x)) +
  geom_histogram(fill = "darkgreen", binwidth = 10) +
  labs(
    title = "News Word Distribution",
    x = "Words",
    y = "Frequency"
  )
# Words per line in the Twitter sample, drawn as a histogram with 5-word bins.
twitter_wc <- stri_count_words(twitter)
ggplot(data = data.frame(x = twitter_wc), mapping = aes(x = x)) +
  geom_histogram(fill = "tomato", binwidth = 5) +
  labs(
    title = "Twitter Word Distribution",
    x = "Words",
    y = "Frequency"
  )
We will build an N-gram model:
The Shiny app will:
This analysis prepares the dataset for building a predictive text application.