This milestone report is part of the Data Science Capstone, whose goal is to build a predictive model for next-word suggestions. The data consist of English text from blogs, news articles, and Twitter.
# Libraries used throughout this report
library(stringi)    # fast word counts
library(tm)         # corpus handling and cleaning
library(tidytext)   # n-gram tokenization
library(tidyverse)  # ggplot2, dplyr, tibble
blogs_file <- "./final/en_US/en_US.blogs.txt"
news_file <- "./final/en_US/en_US.news.txt"
twitter_file <- "./final/en_US/en_US.twitter.txt"
blogs <- readLines(blogs_file, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(news_file, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(twitter_file, encoding = "UTF-8", skipNul = TRUE)
data_summary <- data.frame(
Source = c("Blogs", "News", "Twitter"),
Lines = c(length(blogs), length(news), length(twitter)),
Words = c(sum(stri_count_words(blogs)),
sum(stri_count_words(news)),
sum(stri_count_words(twitter))),
Characters = c(sum(nchar(blogs)),
sum(nchar(news)),
sum(nchar(twitter)))
)
knitr::kable(data_summary, caption = "Basic statistics for the datasets")
| Source | Lines | Words | Characters |
|---|---|---|---|
| Blogs | 899288 | 37546250 | 206824505 |
| News | 1010242 | 34762395 | 203223159 |
| Twitter | 2360148 | 30093413 | 162096241 |
📌 Takeaway: Blogs have the most characters and the longest average lines; tweets are the shortest because of Twitter's character limit, even though Twitter contributes the most lines.
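A quick check of line lengths per source supports this (a small sketch using the vectors loaded above):

# Maximum and mean line length (characters) per source
sapply(list(Blogs = blogs, News = news, Twitter = twitter),
       function(x) c(Max = max(nchar(x)), Mean = round(mean(nchar(x)))))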
set.seed(2025)
sample_size <- 5000
# Sample 5,000 lines from each source to keep processing tractable
text_sample <- c(sample(blogs, sample_size), sample(news, sample_size), sample(twitter, sample_size))
corpus <- VCorpus(VectorSource(text_sample))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)   # tm transformation; no wrapper needed
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, stripWhitespace)
📌 Takeaway: Cleaning (lowercasing, stripping punctuation, numbers, and stopwords) reduces noise so that frequency counts reflect meaningful patterns.
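To sanity-check the cleaning, it helps to compare one document before and after (a quick look; the first corpus element corresponds to text_sample[1]):

text_sample[1]                 # raw sampled line
as.character(corpus[[1]])      # the same line after cleaning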
# Filter profanity using a public word list hosted by CMU
profanity <- readLines("https://www.cs.cmu.edu/~biglou/resources/bad-words.txt")
corpus <- tm_map(corpus, removeWords, profanity)
📌 Takeaway: Important for ensuring the final app is appropriate and user-friendly.
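Fetching the list over the network on every knit is fragile; a cached variant (a sketch, assuming write access to the working directory; the local file name bad-words.txt is our choice):

profanity_file <- "bad-words.txt"
if (!file.exists(profanity_file)) {
  download.file("https://www.cs.cmu.edu/~biglou/resources/bad-words.txt",
                profanity_file, quiet = TRUE)
}
profanity <- readLines(profanity_file, skipNul = TRUE)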
# Build a term-document matrix and compute overall word frequencies
tdm <- TermDocumentMatrix(corpus)
tdm_m <- as.matrix(tdm)
word_freq <- sort(rowSums(tdm_m), decreasing = TRUE)
freq_df <- data.frame(word = names(word_freq), freq = word_freq)
top_words <- head(freq_df, 15)
ggplot(top_words, aes(x = reorder(word, freq), y = freq)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 15 Frequent Words", x = "Words", y = "Frequency")
📌 Takeaway: Shows the most common words after cleaning; useful for spotting residual stopwords and dominant topics.
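A related question for any prediction model is dictionary size. This sketch (reusing the word_freq vector above) estimates how many unique words cover 50% and 90% of all word instances in the sample:

# Cumulative share of word instances covered by the most frequent words
coverage <- cumsum(word_freq) / sum(word_freq)
c(words_for_50pct = which(coverage >= 0.5)[1],
  words_for_90pct = which(coverage >= 0.9)[1])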
# Flatten the cleaned corpus back into a tibble for tidytext
sample_df <- tibble(text = sapply(corpus, as.character))

# Tokenize into n-grams of a given order and count occurrences
get_ngrams <- function(data, n) {
  data %>%
    unnest_tokens(ngram, text, token = "ngrams", n = n) %>%
    count(ngram, sort = TRUE)
}

unigram_df <- get_ngrams(sample_df, 1)
bigram_df <- get_ngrams(sample_df, 2)
trigram_df <- get_ngrams(sample_df, 3)
Computing the n-gram tables is the slowest step, so we save them as .rds files; later sessions (and the eventual Shiny app) can load them instead of re-running the whole pipeline. The commented lines below show how the tables were computed.
#unigram_df <- get_ngrams(sample_df, 1)
#bigram_df <- get_ngrams(sample_df, 2)
#trigram_df <- get_ngrams(sample_df, 3)
saveRDS(unigram_df, file = "unigram_df.rds")
saveRDS(bigram_df, file = "bigram_df.rds")
saveRDS(trigram_df, file = "trigram_df.rds")
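On subsequent runs the tables can be restored rather than recomputed, along these lines (assuming the .rds files sit in the working directory):

if (file.exists("trigram_df.rds")) {
  unigram_df <- readRDS("unigram_df.rds")
  bigram_df  <- readRDS("bigram_df.rds")
  trigram_df <- readRDS("trigram_df.rds")
} else {
  unigram_df <- get_ngrams(sample_df, 1)
  bigram_df  <- get_ngrams(sample_df, 2)
  trigram_df <- get_ngrams(sample_df, 3)
}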
unigram_df %>% top_n(15) %>%
ggplot(aes(x = reorder(ngram, n), y = n)) +
geom_col(fill = "red") + coord_flip() +
labs(title = "Top 15 Unigrams", x = "Bigram", y = "Frequency")
bigram_df %>% top_n(15) %>%
ggplot(aes(x = reorder(ngram, n), y = n)) +
geom_col(fill = "purple") + coord_flip() +
labs(title = "Top 15 Bigrams", x = "Bigram", y = "Frequency")
trigram_df %>% top_n(15) %>%
ggplot(aes(x = reorder(ngram, n), y = n)) +
geom_col(fill = "darkgreen") + coord_flip() +
labs(title = "Top 15 Trigrams", x = "Trigram", y = "Frequency")
📌 Takeaway: Bigrams and trigrams help uncover word pairs/triples that appear frequently together — a key insight for next-word prediction.
predict_next_word <- function(input, ngram_df) {
  input <- tolower(input)
  # Keep only the last two words of the input
  input <- tail(strsplit(input, " ")[[1]], 2)
  match_str <- paste(input, collapse = " ")
  # Match trigrams whose first two words equal the input; the trailing space
  # avoids partial-word matches such as "i love" matching "i lovely"
  filtered <- ngram_df[grepl(paste0("^", match_str, " "), ngram_df$ngram), ]
  head(filtered[order(-filtered$n), ], 3)
}
predict_next_word("i love", trigram_df)
## # A tibble: 0 × 2
## # ℹ 2 variables: ngram <chr>, n <int>
predict_next_word("thanks for", trigram_df)
## # A tibble: 0 × 2
## # ℹ 2 variables: ngram <chr>, n <int>
📌 Takeaway: Demonstrates frequency-based filtering. The empty results above are expected: stopwords such as "i" and "for" were removed during cleaning, so no trigram in the table begins with them; the refined model will keep stopwords and add backoff and smoothing.
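As a preview, a minimal backoff sketch (no smoothing yet; it assumes the *_df tables above and falls back to the bigram table keyed on the last word when the trigram lookup is empty):

predict_backoff <- function(input, trigrams, bigrams) {
  words <- tail(strsplit(tolower(input), "\\s+")[[1]], 2)
  # Try trigrams starting with the last two words
  hits <- trigrams[grepl(paste0("^", paste(words, collapse = " "), " "),
                         trigrams$ngram), ]
  # Back off to bigrams starting with the last word only
  if (nrow(hits) == 0 && length(words) > 1) {
    hits <- bigrams[grepl(paste0("^", words[2], " "), bigrams$ngram), ]
  }
  head(hits[order(-hits$n), ], 3)
}
predict_backoff("thanks for", trigram_df, bigram_df)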
line_lengths <- nchar(text_sample)
ggplot(data.frame(lengths = line_lengths), aes(x = lengths)) +
geom_histogram(bins = 50, fill = "tomato", color = "white") +
labs(title = "Distribution of Line Lengths", x = "Line Length (chars)", y = "Frequency")
📌 Takeaway: Helps in designing limits for real-time inputs in a web app (e.g., Shiny app).
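For example, line-length quantiles from the sample suggest where an input cap could sit:

quantile(line_lengths, probs = c(0.50, 0.90, 0.99))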
Plans for the Shiny app:

- Simple UI with a text box
- Predict the next word on-the-fly
- Display the top 3 suggestions in ranked order
- Use reactive data tables behind the scenes (a minimal sketch follows below)
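A minimal sketch of what the app might look like (assuming predict_next_word() and the saved trigram table from above; the layout is illustrative, not final):

library(shiny)

trigram_df <- readRDS("trigram_df.rds")

ui <- fluidPage(
  titlePanel("Next-Word Prediction"),
  textInput("phrase", "Type a phrase:"),
  tableOutput("suggestions")
)

server <- function(input, output) {
  output$suggestions <- renderTable({
    req(input$phrase)                          # wait for non-empty input
    predict_next_word(input$phrase, trigram_df)
  })
}

shinyApp(ui, server)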