Text Mining Project: EDA and Prediction Plan

library(shiny)
library(ggplot2)
library(dplyr)
library(tidytext)
library(tibble)
library(tm)
library(wordcloud)
library(RColorBrewer)
library(flexdashboard)  # For gauge

# ===========================
# Load and Prepare Data
# ===========================
data("crude")
corpus <- VCorpus(VectorSource(crude))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)

text_df <- tibble(text = sapply(corpus, as.character))

# Unigrams, Bigrams, Trigrams
unigrams <- text_df %>% unnest_tokens(word, text, token = "words") %>% count(word, sort = TRUE)
bigrams <- text_df %>% unnest_tokens(bigram, text, token = "ngrams", n = 2) %>% count(bigram, sort = TRUE)
trigrams <- text_df %>% unnest_tokens(trigram, text, token = "ngrams", n = 3) %>% count(trigram, sort = TRUE)

bigram_freq <- bigrams %>% tidyr::separate(bigram, into = c("w1", "w2"), sep = " ")
trigram_freq <- trigrams %>% tidyr::separate(trigram, into = c("w1", "w2", "w3"), sep = " ")

# ===========================
# Prediction Function (Backoff)
# ===========================
predict_next_word <- function(input_text, top_n = 3) {
  input_text <- tolower(input_text)
  words <- strsplit(input_text, "\\s+")[[1]]
  len <- length(words)
  
  # Try trigram
  if (len >= 2) {
    w1 <- words[len - 1]
    w2 <- words[len]
    trigram_matches <- trigram_freq %>%
      filter(w1 == !!w1, w2 == !!w2) %>%
      arrange(desc(n)) %>%
      head(top_n)
    if (nrow(trigram_matches) > 0) return(trigram_matches$w3)
  }
  
  # Backoff to bigram
  if (len >= 1) {
    w2 <- words[len]
    bigram_matches <- bigram_freq %>%
      filter(w1 == !!w2) %>%
      arrange(desc(n)) %>%
      head(top_n)
    if (nrow(bigram_matches) > 0) return(bigram_matches$w2)
  }
  
  # Backoff to unigram
  unigram_matches <- unigrams %>%
    arrange(desc(n)) %>%
    head(top_n)
  return(unigram_matches$word)
}

# ===========================
# Entropy Calculation
# ===========================
calculate_entropy <- function(predictions) {
  probs <- rep(1 / length(predictions), length(predictions)) # uniform since no probabilities
  entropy <- -sum(probs * log2(probs))
  return(entropy)
}

# ===========================
# Shiny UI
# ===========================
ui <- fluidPage(
  titlePanel("Next Word Predictor (N-gram Model)"),
  
  sidebarLayout(
    sidebarPanel(
      textInput("input_text", "Enter a phrase:", value = "crude oil"),
      actionButton("predict_btn", "Predict Next Word"),
      hr(),
      h4("Documentation"),
      p("This app predicts the next word using an N-gram language model with backoff strategy."),
      p("Visualizations include word frequency, bigrams, trigrams, word cloud, and uncertainty gauge.")
    ),
    
    mainPanel(
      h3("Predicted Next Words:"),
      verbatimTextOutput("prediction"),
      plotOutput("barplot"),
      hr(),
      #h4("Prediction Uncertainty (Entropy):"),
      #gaugeOutput("entropy_gauge"),  # Gauge visualization
      h4("Prediction Uncertainty (Entropy):"),
      verbatimTextOutput("entropy_text"),
      hr(),
      tabsetPanel(
        tabPanel("Word Frequency", plotOutput("word_freq_plot")),
        tabPanel("Top Bigrams", plotOutput("bigram_plot")),
        tabPanel("Top Trigrams", plotOutput("trigram_plot")),
        tabPanel("Word Cloud", plotOutput("wordcloud_plot"))
      )
    )
  )
)

# ===========================
# Shiny Server
# ===========================
server <- function(input, output) {
  observeEvent(input$predict_btn, {
    context <- input$input_text
    predictions <- predict_next_word(context, top_n = 3)
    
    output$prediction <- renderText({
      paste("Top Predictions:", paste(predictions, collapse = ", "))
    })
    
    # Bar plot for predictions
    freq_data <- data.frame(word = predictions, freq = seq(length(predictions), 1))
    output$barplot <- renderPlot({
      ggplot(freq_data, aes(x = reorder(word, -freq), y = freq, fill = word)) +
        geom_bar(stat = "identity") +
        ggtitle("Top Predicted Words") +
        xlab("Word") + ylab("Rank") +
        theme_minimal()
    })
    
    # Entropy Gauge
    entropy_val <- calculate_entropy(predictions)
    
    output$entropy_text <- renderText({
      paste("Entropy of predictions:", round(entropy_val, 3))
    })

    #output$entropy_gauge <- renderGauge({
    #  gauge(entropy_val, min = 0, max = 2, symbol = "",
    #        sectors = gaugeSectors(success = c(0, 0.8), warning = c(0.8, 1.5), danger = c(1.5, 2)),
    #        label = "Entropy")
   # })
  })
  
  # Word Frequency Plot
  output$word_freq_plot <- renderPlot({
    ggplot(unigrams[1:20, ], aes(x = reorder(word, n), y = n)) +
      geom_bar(stat = "identity", fill = "steelblue") +
      coord_flip() +
      ggtitle("Top 20 Most Frequent Words")
  })
  
  # Bigram Plot
  output$bigram_plot <- renderPlot({
    ggplot(bigram_freq[1:20, ], aes(x = reorder(paste(w1, w2), n), y = n)) +
      geom_bar(stat = "identity", fill = "darkgreen") +
      coord_flip() +
      ggtitle("Top 20 Bigrams")
  })
  
  # Trigram Plot
  output$trigram_plot <- renderPlot({
    ggplot(trigram_freq[1:20, ], aes(x = reorder(paste(w1, w2, w3), n), y = n)) +
      geom_bar(stat = "identity", fill = "purple") +
      coord_flip() +
      ggtitle("Top 20 Trigrams")
  })
  
  # Word Cloud
  output$wordcloud_plot <- renderPlot({
    wordcloud(words = unigrams$word, freq = unigrams$n, min.freq = 2,
              max.words = 100, random.order = FALSE, colors = brewer.pal(8, "Dark2"))
  })
}

# ===========================
# Run App
# ===========================
shinyApp(ui = ui, server = server)
}
Slide 1: What is the Product?

Slide 2: How the Model Works – Overview

Slide 3: Model App

Slide 4: Predictive Performance

Slide 5: App Demo – How It Works

Slide 6: Why It’s Awesome

> A lightweight, educational, and practical NLP tool for predictive text modeling.