library(tidytext)
library(dplyr)
library(ggplot2)
library(tokenizers)
# Sample sentences
text_data <- c(
  "The quick brown fox jumps over the lazy dog",
  "A quick brown dog outpaces a lazy fox",
  "How quickly the lazy dog jumps",
  "Foxes and dogs are animals"
)
# Create tidy text dataframe
text_df <- tibble(line = seq_along(text_data), text = text_data)
# Tokenize words
word_counts <- text_df %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE)
# Display top words
head(word_counts, 10) %>%
  knitr::kable(caption = "Top 10 Words")
| word    | n |
|---------|---|
| dog     | 3 |
| lazy    | 3 |
| the     | 3 |
| a       | 2 |
| brown   | 2 |
| fox     | 2 |
| jumps   | 2 |
| quick   | 2 |
| and     | 1 |
| animals | 1 |
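Function words like "the" and "a" dominate these counts. For frequency analysis you may want content words only; a minimal sketch using tidytext's built-in stop_words lexicon (keep the stop words for the prediction model below, since they are valid next words):

# Remove common English stop words via an anti-join
data(stop_words)
content_counts <- text_df %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)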
# Bigram tokenization
bigrams <- text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  count(bigram, sort = TRUE)
# Trigram tokenization
trigrams <- text_df %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  count(trigram, sort = TRUE)
# Show top bigrams
head(bigrams, 5) %>%
  knitr::kable(caption = "Top 5 Bigrams")
| bigram      | n |
|-------------|---|
| lazy dog    | 2 |
| quick brown | 2 |
| the lazy    | 2 |
| a lazy      | 1 |
| a quick     | 1 |
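These counts are the raw material for a maximum-likelihood bigram model: P(next | context) is the bigram count divided by the total count of the context word. A sketch using tidyr::separate (tidyr is not loaded above; this is an addition):

library(tidyr)
# Split each bigram into context and next word, then
# estimate P(next | context) = count(context, next) / count(context)
bigram_probs <- bigrams %>%
  separate(bigram, into = c("context", "next_word"), sep = " ") %>%
  group_by(context) %>%
  mutate(prob = n / sum(n)) %>%
  ungroup() %>%
  arrange(context, desc(prob))
# With this corpus, P(dog | lazy) = 2/3 and P(fox | lazy) = 1/3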
predict_next_word <- function(input_text, n = 3) {
  # Tokenize and lowercase the input (tokenize_words lowercases by default,
  # matching the lowercase tokens produced by unnest_tokens)
  words <- tokenize_words(input_text)[[1]]
  # Try a trigram match first: context = last two words
  if (length(words) >= 2) {
    last_two <- paste(tail(words, 2), collapse = " ")
    trigram_matches <- trigrams %>%
      filter(startsWith(trigram, paste0(last_two, " "))) %>%
      head(n)
    if (nrow(trigram_matches) > 0) {
      # Return only the predicted word (the last word of each trigram)
      return(sub(".* ", "", trigram_matches$trigram))
    }
  }
  # Back off to a bigram match: context = last word
  if (length(words) >= 1) {
    last_one <- tail(words, 1)
    bigram_matches <- bigrams %>%
      filter(startsWith(bigram, paste0(last_one, " "))) %>%
      head(n)
    if (nrow(bigram_matches) > 0) {
      return(sub(".* ", "", bigram_matches$bigram))
    }
  }
  # Final fallback: the most frequent words overall
  head(word_counts$word, n)
}
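The function implements a simple backoff strategy: prefer the longest matching context (two words), fall back to a one-word context, and finally to unconditional word frequencies, so it always returns something even for unseen input.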
# Test prediction
test_phrase <- "quick brown"
predictions <- predict_next_word(test_phrase)
knitr::kable(data.frame(Predictions = predictions),
             caption = paste("Next word predictions for:", test_phrase))
| Predictions |
|-------------|
| dog         |
| fox         |
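The backoff path can be exercised with a context the corpus has never seen; "purple" (an arbitrary out-of-vocabulary word) matches no trigram or bigram, so with this toy corpus the fallback returns the three most frequent words:

# No n-gram starts with "purple", so this falls through
# to the overall word counts: "dog", "lazy", "the"
predict_next_word("purple")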
# Plot word frequencies
word_counts %>%
  head(10) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 10 Word Frequencies", x = "", y = "Count") +
  theme_minimal()
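The same plotting pattern applies to the n-gram counts; for example, the bigram frequencies:

# Plot the top bigrams with the same reorder + coord_flip approach
bigrams %>%
  head(5) %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(bigram, n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 5 Bigram Frequencies", x = "", y = "Count") +
  theme_minimal()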
Next steps:

- Add your own text files in the data/ folder
- Use readLines() to load larger datasets (see the sketch below)
- Consider these advanced packages for better performance:

install.packages(c("quanteda", "text2vec"))
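A minimal loading sketch, assuming a plain-text file at data/corpus.txt (a hypothetical path) with one document per line:

# Read a local corpus and feed it into the same pipeline
raw_lines <- readLines("data/corpus.txt")
corpus_df <- tibble(line = seq_along(raw_lines), text = raw_lines)
corpus_counts <- corpus_df %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE)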