library(tidytext)
library(dplyr)
library(ggplot2)
library(tokenizers)
# Sample sentences used as the corpus for every table and plot below
text_data <- c(
  "The quick brown fox jumps over the lazy dog",
  "A quick brown dog outpaces a lazy fox",
  "How quickly the lazy dog jumps",
  "Foxes and dogs are animals"
)

# Create tidy text dataframe: one row per sentence, keyed by its position.
# seq_along() yields an empty index for an empty corpus, whereas
# 1:length() would yield c(1, 0) and make tibble() fail on mismatched sizes.
text_df <- tibble(line = seq_along(text_data), text = text_data)

# Tokenize into single words and count them, most frequent first
word_counts <- text_df %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE)

# Display top words
head(word_counts, 10) %>%
  knitr::kable(caption = "Top 10 Words")
| word | n |
|---|---|
| dog | 3 |
| lazy | 3 |
| the | 3 |
| a | 2 |
| brown | 2 |
| fox | 2 |
| jumps | 2 |
| quick | 2 |
| and | 1 |
| animals | 1 |
# Two-word n-grams, counted and ordered by descending frequency
bigram_tokens <- unnest_tokens(
  text_df, bigram, text,
  token = "ngrams", n = 2
)
bigrams <- count(bigram_tokens, bigram, sort = TRUE)

# Three-word n-grams, counted and ordered the same way
trigram_tokens <- unnest_tokens(
  text_df, trigram, text,
  token = "ngrams", n = 3
)
trigrams <- count(trigram_tokens, trigram, sort = TRUE)

# Render the first five rows of the bigram counts as a table
knitr::kable(head(bigrams, 5), caption = "Top 5 Bigrams")
| bigram | n |
|---|---|
| lazy dog | 2 |
| quick brown | 2 |
| the lazy | 2 |
| a lazy | 1 |
| a quick | 1 |
#' Suggest continuations for a phrase using n-gram back-off
#'
#' Looks up the last two words of `input_text` among the trigram counts. If
#' nothing matches, falls back to the last word among the bigram counts, and
#' finally to the first rows of the single-word counts.
#'
#' Reads the `trigrams`, `bigrams` and `word_counts` tables from the
#' enclosing environment; they must exist before this function is called.
#'
#' @param input_text Phrase to continue. Only the tokens of its first
#'   element are used.
#' @param n Maximum number of suggestions to return.
#' @return A character vector with at most `n` entries: the complete
#'   matching trigrams or bigrams (context words included, not only the
#'   following word), or single words when no n-gram matches.
predict_next_word <- function(input_text, n = 3) {
  # Tokenize input
  words <- tokenize_words(input_text)[[1]]

  # Prefixes are matched literally with startsWith() rather than spliced
  # into a regular expression, so regex metacharacters inside a token
  # cannot change what is matched.

  # Try trigram match first
  if (length(words) >= 2) {
    trigram_prefix <- paste0(paste(tail(words, 2), collapse = " "), " ")
    trigram_matches <- trigrams %>%
      filter(startsWith(trigram, trigram_prefix)) %>%
      head(n)
    if (nrow(trigram_matches) > 0) {
      return(trigram_matches$trigram)
    }
  }

  # Fall back to bigram
  if (length(words) >= 1) {
    bigram_prefix <- paste0(tail(words, 1), " ")
    bigram_matches <- bigrams %>%
      filter(startsWith(bigram, bigram_prefix)) %>%
      head(n)
    if (nrow(bigram_matches) > 0) {
      return(bigram_matches$bigram)
    }
  }

  # Default to common words
  head(word_counts$word, n)
}
# Run the predictor on a sample phrase and tabulate what it returns
test_phrase <- "quick brown"
predictions <- predict_next_word(test_phrase)
prediction_table <- data.frame(Predictions = predictions)
knitr::kable(
  prediction_table,
  caption = paste("Next word predictions for:", test_phrase)
)
| Predictions |
|---|
| quick brown dog |
| quick brown fox |
# Horizontal bar chart of the first ten rows of the word counts.
# Reordering `word` by `n` makes the bars appear sorted by count.
top_words <- mutate(head(word_counts, 10), word = reorder(word, n))

ggplot(top_words, aes(x = word, y = n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 10 Word Frequencies", x = "", y = "Count") +
  theme_minimal()
Add your own text files in the data/ folder
Use readLines() to load larger datasets
Consider these advanced packages for better performance:
# Install the packages suggested above
advanced_packages <- c("quanteda", "text2vec")
install.packages(advanced_packages)