The goal of this project is to analyze text data from blogs, news, and Twitter and later build a predictive text model and Shiny application.
library(stringi)
library(ggplot2)
# Read the first `n` lines of a text file, marking the strings as UTF-8 and
# skipping embedded NUL bytes.
read_corpus_lines <- function(path, n = 5000) {
  readLines(path, n = n, encoding = "UTF-8", skipNul = TRUE)
}

# Load the leading 5000 lines of each English corpus file.
blogs <- read_corpus_lines("final/en_US/en_US.blogs.txt")
news <- read_corpus_lines("final/en_US/en_US.news.txt")
twitter <- read_corpus_lines("final/en_US/en_US.twitter.txt")
# Summarise each corpus: number of lines read and total word count
# (words counted with stringi's stri_count_words()).
stats <- local({
  corpora <- list(Blogs = blogs, News = news, Twitter = twitter)
  data.frame(
    File = names(corpora),
    # USE.NAMES = FALSE keeps the default 1..n row names on the data frame.
    Lines = vapply(corpora, length, integer(1), USE.NAMES = FALSE),
    Words = vapply(
      corpora,
      function(txt) sum(stri_count_words(txt)),
      integer(1),
      USE.NAMES = FALSE
    )
  )
})
stats
##      File Lines  Words
## 1   Blogs  5000 206913
## 2    News  5000 173566
## 3 Twitter  5000  63252
# Draw a random sample of blog lines and plot the distribution of words per
# line. The sample size is capped at the number of lines available so that
# sample() does not error when fewer than 1000 lines were read.
sample_blogs <- sample(blogs, min(1000, length(blogs)))
word_count <- stri_count_words(sample_blogs)

# ggplot() + geom_histogram() replaces qplot(), which is deprecated as of
# ggplot2 3.4.0; the title, x-axis label, and bin count are unchanged.
ggplot(data.frame(word_count = word_count), aes(x = word_count)) +
  geom_histogram(bins = 30) +
  labs(title = "Histogram of Word Counts", x = "Words Per Line")
The next step is to clean the data, create n-gram models, and build a Shiny application for next-word prediction.