library(tidytext)
library(dplyr)
library(ggplot2)
library(tokenizers)
# Sample sentences
text_data <- c(
  "The quick brown fox jumps over the lazy dog",
  "A quick brown dog outpaces a lazy fox",
  "How quickly the lazy dog jumps",
  "Foxes and dogs are animals"
)
# Create tidy text dataframe
text_df <- tibble(line = seq_along(text_data), text = text_data)
# Tokenize words
word_counts <- text_df %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE)
# Display top words
head(word_counts, 10) %>%
  knitr::kable(caption = "Top 10 Words")
| word    | n |
|---------|---|
| dog     | 3 |
| lazy    | 3 |
| the     | 3 |
| a       | 2 |
| brown   | 2 |
| fox     | 2 |
| jumps   | 2 |
| quick   | 2 |
| and     | 1 |
| animals | 1 |
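Function words like "the" and "a" dominate these counts. For frequency analysis you may want content words only; a minimal sketch using tidytext's built-in stop_words lexicon (keep the stop words for the prediction model below, since they are valid next words):

# Remove common English stop words via an anti-join
data(stop_words)
content_counts <- text_df %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)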
# Bigram tokenization
bigrams <- text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  count(bigram, sort = TRUE)
# Trigram tokenization
trigrams <- text_df %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  count(trigram, sort = TRUE)
# Show top bigrams
head(bigrams, 5) %>%
  knitr::kable(caption = "Top 5 Bigrams")
| bigram      | n |
|-------------|---|
| lazy dog    | 2 |
| quick brown | 2 |
| the lazy    | 2 |
| a lazy      | 1 |
| a quick     | 1 |
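These counts are the raw material for a maximum-likelihood bigram model: P(next | context) is the bigram count divided by the total count of the context word. A sketch using tidyr::separate (tidyr is not loaded above; this is an addition):

library(tidyr)
# Split each bigram into context and next word, then
# estimate P(next | context) = count(context, next) / count(context)
bigram_probs <- bigrams %>%
  separate(bigram, into = c("context", "next_word"), sep = " ") %>%
  group_by(context) %>%
  mutate(prob = n / sum(n)) %>%
  ungroup() %>%
  arrange(context, desc(prob))
# With this corpus, P(dog | lazy) = 2/3 and P(fox | lazy) = 1/3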
predict_next_word <- function(input_text, n = 3) {
  # Tokenize and lowercase the input (tokenize_words lowercases by default,
  # matching the lowercase tokens produced by unnest_tokens)
  words <- tokenize_words(input_text)[[1]]
  # Try a trigram match first: context = last two words
  if (length(words) >= 2) {
    last_two <- paste(tail(words, 2), collapse = " ")
    trigram_matches <- trigrams %>%
      filter(startsWith(trigram, paste0(last_two, " "))) %>%
      head(n)
    if (nrow(trigram_matches) > 0) {
      # Return only the predicted word (the last word of each trigram)
      return(sub(".* ", "", trigram_matches$trigram))
    }
  }
  # Back off to a bigram match: context = last word
  if (length(words) >= 1) {
    last_one <- tail(words, 1)
    bigram_matches <- bigrams %>%
      filter(startsWith(bigram, paste0(last_one, " "))) %>%
      head(n)
    if (nrow(bigram_matches) > 0) {
      return(sub(".* ", "", bigram_matches$bigram))
    }
  }
  # Final fallback: the most frequent words overall
  head(word_counts$word, n)
}
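The function implements a simple backoff strategy: prefer the longest matching context (two words), fall back to a one-word context, and finally to unconditional word frequencies, so it always returns something even for unseen input.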
# Test prediction
test_phrase <- "quick brown"
predictions <- predict_next_word(test_phrase)
knitr::kable(data.frame(Predictions = predictions),
             caption = paste("Next word predictions for:", test_phrase))
| Predictions |
|-------------|
| dog         |
| fox         |
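The backoff path can be exercised with a context the corpus has never seen; "purple" (an arbitrary out-of-vocabulary word) matches no trigram or bigram, so with this toy corpus the fallback returns the three most frequent words:

# No n-gram starts with "purple", so this falls through
# to the overall word counts: "dog", "lazy", "the"
predict_next_word("purple")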
# Plot word frequencies
word_counts %>%
  head(10) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 10 Word Frequencies", x = "", y = "Count") +
  theme_minimal()
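The same plotting pattern applies to the n-gram counts; for example, the bigram frequencies:

# Plot the top bigrams with the same reorder + coord_flip approach
bigrams %>%
  head(5) %>%
  mutate(bigram = reorder(bigram, n)) %>%
  ggplot(aes(bigram, n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 5 Bigram Frequencies", x = "", y = "Count") +
  theme_minimal()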
Next steps:

- Add your own text files in the data/ folder
- Use readLines() to load larger datasets (see the sketch below)
- Consider these advanced packages for better performance:

install.packages(c("quanteda", "text2vec"))
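A minimal loading sketch, assuming a plain-text file at data/corpus.txt (a hypothetical path) with one document per line:

# Read a local corpus and feed it into the same pipeline
raw_lines <- readLines("data/corpus.txt")
corpus_df <- tibble(line = seq_along(raw_lines), text = raw_lines)
corpus_counts <- corpus_df %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE)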