2025-10-23
Next Word Predictor is a Shiny app that uses an N-gram language model to predict the next word in a phrase.
library(shiny)
library(ggplot2)
library(dplyr)
library(tidytext)
library(tibble)
library(tm)
library(wordcloud)
library(RColorBrewer)
library(flexdashboard) # For gauge
# ===========================
# Load and Prepare Data
# ===========================
data("crude")
corpus <- VCorpus(VectorSource(crude))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)
text_df <- tibble(text = sapply(corpus, as.character))
# Unigrams, Bigrams, Trigrams
unigrams <- text_df %>% unnest_tokens(word, text, token = "words") %>% count(word, sort = TRUE)
bigrams <- text_df %>% unnest_tokens(bigram, text, token = "ngrams", n = 2) %>% count(bigram, sort = TRUE)
trigrams <- text_df %>% unnest_tokens(trigram, text, token = "ngrams", n = 3) %>% count(trigram, sort = TRUE)
bigram_freq <- bigrams %>% tidyr::separate(bigram, into = c("w1", "w2"), sep = " ")
trigram_freq <- trigrams %>% tidyr::separate(trigram, into = c("w1", "w2", "w3"), sep = " ")
# ===========================
# Prediction Function (Backoff)
# ===========================
predict_next_word <- function(input_text, top_n = 3) {
input_text <- tolower(input_text)
words <- strsplit(input_text, "\\s+")[[1]]
len <- length(words)
# Try trigram
if (len >= 2) {
w1 <- words[len - 1]
w2 <- words[len]
trigram_matches <- trigram_freq %>%
filter(w1 == !!w1, w2 == !!w2) %>%
arrange(desc(n)) %>%
head(top_n)
if (nrow(trigram_matches) > 0) return(trigram_matches$w3)
}
# Backoff to bigram
if (len >= 1) {
w2 <- words[len]
bigram_matches <- bigram_freq %>%
filter(w1 == !!w2) %>%
arrange(desc(n)) %>%
head(top_n)
if (nrow(bigram_matches) > 0) return(bigram_matches$w2)
}
# Backoff to unigram
unigram_matches <- unigrams %>%
arrange(desc(n)) %>%
head(top_n)
return(unigram_matches$word)
}
# ===========================
# Entropy Calculation
# ===========================
calculate_entropy <- function(predictions) {
probs <- rep(1 / length(predictions), length(predictions)) # uniform since no probabilities
entropy <- -sum(probs * log2(probs))
return(entropy)
}
# ===========================
# Shiny UI
# ===========================
ui <- fluidPage(
titlePanel("Next Word Predictor (N-gram Model)"),
sidebarLayout(
sidebarPanel(
textInput("input_text", "Enter a phrase:", value = "crude oil"),
actionButton("predict_btn", "Predict Next Word"),
hr(),
h4("Documentation"),
p("This app predicts the next word using an N-gram language model with backoff strategy."),
p("Visualizations include word frequency, bigrams, trigrams, word cloud, and uncertainty gauge.")
),
mainPanel(
h3("Predicted Next Words:"),
verbatimTextOutput("prediction"),
plotOutput("barplot"),
hr(),
#h4("Prediction Uncertainty (Entropy):"),
#gaugeOutput("entropy_gauge"), # Gauge visualization
h4("Prediction Uncertainty (Entropy):"),
verbatimTextOutput("entropy_text"),
hr(),
tabsetPanel(
tabPanel("Word Frequency", plotOutput("word_freq_plot")),
tabPanel("Top Bigrams", plotOutput("bigram_plot")),
tabPanel("Top Trigrams", plotOutput("trigram_plot")),
tabPanel("Word Cloud", plotOutput("wordcloud_plot"))
)
)
)
)
# ===========================
# Shiny Server
# ===========================
server <- function(input, output) {
observeEvent(input$predict_btn, {
context <- input$input_text
predictions <- predict_next_word(context, top_n = 3)
output$prediction <- renderText({
paste("Top Predictions:", paste(predictions, collapse = ", "))
})
# Bar plot for predictions
freq_data <- data.frame(word = predictions, freq = seq(length(predictions), 1))
output$barplot <- renderPlot({
ggplot(freq_data, aes(x = reorder(word, -freq), y = freq, fill = word)) +
geom_bar(stat = "identity") +
ggtitle("Top Predicted Words") +
xlab("Word") + ylab("Rank") +
theme_minimal()
})
# Entropy Gauge
entropy_val <- calculate_entropy(predictions)
output$entropy_text <- renderText({
paste("Entropy of predictions:", round(entropy_val, 3))
})
#output$entropy_gauge <- renderGauge({
# gauge(entropy_val, min = 0, max = 2, symbol = "",
# sectors = gaugeSectors(success = c(0, 0.8), warning = c(0.8, 1.5), danger = c(1.5, 2)),
# label = "Entropy")
# })
})
# Word Frequency Plot
output$word_freq_plot <- renderPlot({
ggplot(unigrams[1:20, ], aes(x = reorder(word, n), y = n)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
ggtitle("Top 20 Most Frequent Words")
})
# Bigram Plot
output$bigram_plot <- renderPlot({
ggplot(bigram_freq[1:20, ], aes(x = reorder(paste(w1, w2), n), y = n)) +
geom_bar(stat = "identity", fill = "darkgreen") +
coord_flip() +
ggtitle("Top 20 Bigrams")
})
# Trigram Plot
output$trigram_plot <- renderPlot({
ggplot(trigram_freq[1:20, ], aes(x = reorder(paste(w1, w2, w3), n), y = n)) +
geom_bar(stat = "identity", fill = "purple") +
coord_flip() +
ggtitle("Top 20 Trigrams")
})
# Word Cloud
output$wordcloud_plot <- renderPlot({
wordcloud(words = unigrams$word, freq = unigrams$n, min.freq = 2,
max.words = 100, random.order = FALSE, colors = brewer.pal(8, "Dark2"))
})
}
# ===========================
# Run App
# ===========================
shinyApp(ui = ui, server = server)
}
Quantitative Metrics:
The model balances simplicity and effectiveness, ideal for real-time applications.
output$entropy_text <- renderText({
paste("Entropy of predictions:", round(entropy_val, 3))
})