1. Load Packages

library(tidytext)
library(dplyr)
library(ggplot2)
library(tokenizers)

2. Sample Data Preparation

# Sample sentences: the small corpus every later section is built from
text_data <- c(
  "The quick brown fox jumps over the lazy dog",
  "A quick brown dog outpaces a lazy fox",
  "How quickly the lazy dog jumps",
  "Foxes and dogs are animals"
)

# Create tidy text dataframe: one row per sentence, `line` holding its
# position in text_data. seq_along() is used rather than 1:length() so that
# an empty corpus produces zero rows instead of the index vector c(1, 0).
text_df <- tibble(line = seq_along(text_data), text = text_data)

3. Word Frequency Analysis

# Break every sentence into one-word tokens, then tally each distinct word
# with the most frequent first
word_counts <- count(
  unnest_tokens(text_df, word, text),
  word,
  sort = TRUE
)

# Render the ten most frequent words as a captioned table
knitr::kable(head(word_counts, 10), caption = "Top 10 Words")
Top 10 Words
word n
dog 3
lazy 3
the 3
a 2
brown 2
fox 2
jumps 2
quick 2
and 1
animals 1

4. N-Gram Analysis

# Bigram tokenization: slide a 2-word window over each sentence and tally
# the resulting pairs, most frequent first. A line with fewer than two words
# yields an NA token; it is dropped so it is never counted as a bigram.
bigrams <- text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE)

# Trigram tokenization: same as above with a 3-word window; lines with fewer
# than three words are dropped for the same reason.
trigrams <- text_df %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram)) %>%
  count(trigram, sort = TRUE)

# Show top bigrams
head(bigrams, 5) %>%
  knitr::kable(caption = "Top 5 Bigrams")
Top 5 Bigrams
bigram n
lazy dog 2
quick brown 2
the lazy 2
a lazy 1
a quick 1

5. Simple Prediction Function

#' Suggest continuations for a phrase from the n-gram tables
#'
#' Looks up the last two words of `input_text` among the trigrams, then the
#' last word among the bigrams, and finally falls back to the most frequent
#' single words. Relies on the `trigrams`, `bigrams` and `word_counts`
#' tables built earlier in this script.
#'
#' @param input_text Text to continue; only its first element is tokenized.
#' @param n Maximum number of suggestions to return.
#' @return A character vector of at most `n` entries: the complete matching
#'   trigrams if any, otherwise the complete matching bigrams, otherwise the
#'   leading entries of `word_counts$word`.
predict_next_word <- function(input_text, n = 3) {
  # Tokenize input
  words <- tokenize_words(input_text)[[1]]

  # Try trigram match first
  if (length(words) >= 2) {
    # The trailing space anchors the match on whole words. startsWith()
    # compares literally, so tokens containing regex metacharacters
    # (e.g. the "." in "3.5") cannot match unintended n-grams.
    prefix <- paste0(paste(tail(words, 2), collapse = " "), " ")
    trigram_matches <- trigrams %>%
      filter(startsWith(trigram, prefix)) %>%
      head(n)

    if (nrow(trigram_matches) > 0) {
      return(trigram_matches$trigram)
    }
  }

  # Fall back to bigram
  if (length(words) >= 1) {
    prefix <- paste0(tail(words, 1), " ")
    bigram_matches <- bigrams %>%
      filter(startsWith(bigram, prefix)) %>%
      head(n)

    if (nrow(bigram_matches) > 0) {
      return(bigram_matches$bigram)
    }
  }

  # Default to common words
  head(word_counts$word, n)
}

# Try the predictor on a sample phrase and tabulate whatever it returns
test_phrase <- "quick brown"
predictions <- predict_next_word(test_phrase)
data.frame(Predictions = predictions) %>%
  knitr::kable(caption = paste("Next word predictions for:", test_phrase))
Next word predictions for: quick brown
Predictions
quick brown dog
quick brown fox

6. Visualization

# Horizontal bar chart of the ten most frequent words; reorder() sorts the
# bars by count instead of alphabetically
ggplot(
  mutate(head(word_counts, 10), word = reorder(word, n)),
  aes(word, n)
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 10 Word Frequencies", x = "", y = "Count") +
  theme_minimal()

7. How to Expand This Project

  1. Add your own text files to the data/ folder

  2. Use readLines() to load larger datasets

  3. Consider these advanced packages for better performance:

    install.packages(c("quanteda", "text2vec"))