Exploratory Analysis and Predictive Modeling

Introduction

The goal of this project is to explore the dataset, understand its structure, and prepare for building a predictive word model using n-grams. The dataset consists of text from blogs, news articles, and Twitter posts.

Task 2: Data Cleaning and N-Gram Analysis

Data Loading and Summary

The dataset contains three text files: en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt. The code below first installs and loads the required packages; the files are then read and summarized in the Exploratory Data Analysis section.

# Install any missing packages, then attach everything the analysis uses.
#
# ggplot2 and dplyr are now attached with library() like every other package.
# Previously they were only loaded as a side effect of require(), so on a
# machine where they had just been installed the later ggplot() calls could
# not be resolved.
required_packages <- c(
  "dplyr", "ggplot2", "wordcloud", "data.table", "tm", "RColorBrewer",
  "tidytext", "tidyr", "stringr", "parallel", "magrittr"
)

# requireNamespace() only checks availability; it does not attach anything.
is_installed <- vapply(
  required_packages, requireNamespace, logical(1),
  quietly = TRUE
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Attach in the same order as before so function masking is unchanged.
# library() stops with an error if a package is still unavailable, instead of
# silently returning FALSE the way require() does.
invisible(lapply(required_packages, library, character.only = TRUE))

Exploratory Data Analysis

setwd("E:\\RStudio\\Coursera-SwiftKey\\final\\en_US")

# Read every line of a UTF-8 text file through a binary-mode connection.
#
# Args:
#   path: path to the text file.
# Returns:
#   character vector with one element per line.
#
# Why binary mode: readLines() on a bare path opens the file in text mode.
# The previously rendered summary reported only 77,259 news lines for a
# ~206 MB file, which is the symptom of a text-mode read stopping at an
# embedded control character (presumably Ctrl-Z / SUB on Windows).
# Reading via an "rb" connection consumes the whole file; readLines() still
# accepts LF, CRLF and CR line endings in that mode.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

# Load datasets
blogs <- read_corpus("en_US.blogs.txt")
news <- read_corpus("en_US.news.txt")
twitter <- read_corpus("en_US.twitter.txt")

# Summary statistics: lines, whitespace-delimited words and on-disk size (MB)
corpus_paths <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
data_summary <- data.frame(
  Dataset = c("Blogs", "News", "Twitter"),
  Line_Count = c(length(blogs), length(news), length(twitter)),
  Word_Count = c(sum(str_count(blogs, "\\S+")),
                 sum(str_count(news, "\\S+")),
                 sum(str_count(twitter, "\\S+"))),
  File_Size_MB = file.info(corpus_paths)$size / 1e+6
)
data_summary

##   Dataset Line_Count Word_Count File_Size_MB
## 1   Blogs     899288   37334131     210.1600
## 2    News      77259    2643969     205.8119
## 3 Twitter    2360148   30373583     167.1053

Word Count Distribution

The distribution of word counts in each dataset is visualized below.

# Words per line for each source; a "word" is a run of non-whitespace characters
blogs_word_counts <- str_count(blogs, "\\S+")
news_word_counts <- str_count(news, "\\S+")
twitter_word_counts <- str_count(twitter, "\\S+")

# Stack the three vectors into long format: one row per line of text,
# labelled with the source it came from
counts_by_source <- list(
  Blogs = blogs_word_counts,
  News = news_word_counts,
  Twitter = twitter_word_counts
)
word_counts_df <- data.frame(
  Dataset = rep(names(counts_by_source), times = lengths(counts_by_source)),
  Word_Count = unlist(counts_by_source, use.names = FALSE)
)

# One histogram panel per source; the y axis is free because the sources
# differ greatly in their number of lines
ggplot(data = word_counts_df, mapping = aes(x = Word_Count, fill = Dataset)) +
  geom_histogram(position = "identity", alpha = 0.7, binwidth = 5) +
  labs(title = "Word Count Distribution", x = "Words per Line", y = "Frequency") +
  facet_wrap(facets = ~Dataset, scales = "free_y") +
  theme_minimal()

Frequent Words

# Combine datasets
all_text <- c(blogs, news, twitter)

# Preprocess text: lower-case, keep only a-z, apostrophes and whitespace,
# then collapse every run of whitespace to a single space
clean_text <- tolower(all_text) %>%
  str_replace_all("[^a-z'\\s]", "") %>%
  str_replace_all("\\s+", " ")

# Tokenize into words.
# Splitting on a single space yields "" for blank lines and for lines that
# start or end with a space after cleaning. Those empty tokens are dropped so
# they are not counted as a (very frequent) word.
words <- unlist(str_split(clean_text, " "), use.names = FALSE)
words <- words[nzchar(words)]
word_freq <- table(words)

# Create word cloud of the 100 most frequent words
wordcloud(names(word_freq), as.numeric(word_freq), max.words = 100, colors = brewer.pal(8, "Dark2"))

Bigram Analysis

# Tokenize into bigrams (one row per bigram occurrence)
bigrams <- all_text %>%
  tibble::tibble(text = .) %>%                              # Create a tibble
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

# Convert to data.table for faster processing
bigrams_dt <- as.data.table(bigrams)

# Count frequencies, most frequent first.
# unnest_tokens() can emit an NA bigram for lines with fewer than two words;
# those rows are excluded so "NA" is not counted as a bigram.
bigram_counts <- bigrams_dt[!is.na(bigram), .N, by = bigram][order(-N)]

# Plot top bigrams. head() is used instead of [1:20] so that no all-NA rows
# are added when fewer than 20 distinct bigrams exist.
ggplot(head(bigram_counts, 20), aes(x = reorder(bigram, N), y = N)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Bigrams", x = "Bigram", y = "Frequency") +
  theme_minimal()

Trigram Analysis

# Count the word trigrams found in a chunk of text.
#
# Args:
#   text_chunk: character vector of text lines.
# Returns:
#   data.table with columns word1, word2, word3 (the three words of the
#   trigram) and N (occurrences within this chunk), sorted by decreasing N.
#   A chunk that contains no trigram returns a zero-row table with the same
#   columns.
process_trigrams <- function(text_chunk) {
  trigrams <- tibble::tibble(text = text_chunk) %>%
    unnest_tokens(trigram, text, token = "ngrams", n = 3)

  trigrams_dt <- as.data.table(trigrams)

  # unnest_tokens() can emit an NA trigram for lines with fewer than three
  # words. Left in, those rows are counted as an (NA, NA, NA) trigram, and a
  # chunk made up only of short lines splits into a single column, which makes
  # setnames() below fail.
  trigrams_dt <- trigrams_dt[!is.na(trigram)]

  if (nrow(trigrams_dt) == 0L) {
    return(data.table(
      word1 = character(),
      word2 = character(),
      word3 = character(),
      N = integer()
    ))
  }

  trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
  setnames(trigrams_split, c("word1", "word2", "word3"))

  trigrams_split[, .N, by = .(word1, word2, word3)][order(-N)]
}

# Ensure all_text is properly initialized (rebuild it if the earlier chunk
# that creates it was skipped)
if (!exists("all_text")) {
  all_text <- c(blogs, news, twitter)
}

# Check if all_text is valid and not empty
if (length(all_text) == 0) {
  stop("Error: 'all_text' is empty. Please ensure data files are loaded correctly.")
}

# Divide the dataset into chunks of at most chunk_size lines
chunk_size <- 5000
text_chunks <- split(all_text, ceiling(seq_along(all_text) / chunk_size))

# Use parallel processing. Leave one core free, but always start at least one
# worker: detectCores() can return 1 (giving 0 workers) or NA.
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(n_workers)

# Attach the packages process_trigrams needs on every worker.
# The function itself is not redefined here: parLapply() sends the function
# it is given to the workers, so a worker-side copy was never used.
# invisible() keeps the per-worker return values out of the rendered report.
invisible(clusterEvalQ(cl, {
  library(dplyr)
  library(tibble)
  library(tidyr)
  library(tidytext)
  library(data.table)
  library(magrittr)
}))

## [[1]]
## function (text_chunk) 
## {
##     trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram, 
##         text, token = "ngrams", n = 3)
##     trigrams_dt <- as.data.table(trigrams)
##     trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
##     setnames(trigrams_split, c("word1", "word2", "word3"))
##     trigram_counts_chunk <- trigrams_split[, .N, by = .(word1, 
##         word2, word3)][order(-N)]
##     return(trigram_counts_chunk)
## }
## 
## [[2]]
## function (text_chunk) 
## {
##     trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram, 
##         text, token = "ngrams", n = 3)
##     trigrams_dt <- as.data.table(trigrams)
##     trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
##     setnames(trigrams_split, c("word1", "word2", "word3"))
##     trigram_counts_chunk <- trigrams_split[, .N, by = .(word1, 
##         word2, word3)][order(-N)]
##     return(trigram_counts_chunk)
## }
## 
## [[3]]
## function (text_chunk) 
## {
##     trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram, 
##         text, token = "ngrams", n = 3)
##     trigrams_dt <- as.data.table(trigrams)
##     trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
##     setnames(trigrams_split, c("word1", "word2", "word3"))
##     trigram_counts_chunk <- trigrams_split[, .N, by = .(word1, 
##         word2, word3)][order(-N)]
##     return(trigram_counts_chunk)
## }
## 
## [[4]]
## function (text_chunk) 
## {
##     trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram, 
##         text, token = "ngrams", n = 3)
##     trigrams_dt <- as.data.table(trigrams)
##     trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
##     setnames(trigrams_split, c("word1", "word2", "word3"))
##     trigram_counts_chunk <- trigrams_split[, .N, by = .(word1, 
##         word2, word3)][order(-N)]
##     return(trigram_counts_chunk)
## }
## 
## [[5]]
## function (text_chunk) 
## {
##     trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram, 
##         text, token = "ngrams", n = 3)
##     trigrams_dt <- as.data.table(trigrams)
##     trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
##     setnames(trigrams_split, c("word1", "word2", "word3"))
##     trigram_counts_chunk <- trigrams_split[, .N, by = .(word1, 
##         word2, word3)][order(-N)]
##     return(trigram_counts_chunk)
## }
## 
## [[6]]
## function (text_chunk) 
## {
##     trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram, 
##         text, token = "ngrams", n = 3)
##     trigrams_dt <- as.data.table(trigrams)
##     trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
##     setnames(trigrams_split, c("word1", "word2", "word3"))
##     trigram_counts_chunk <- trigrams_split[, .N, by = .(word1, 
##         word2, word3)][order(-N)]
##     return(trigram_counts_chunk)
## }
## 
## [[7]]
## function (text_chunk) 
## {
##     trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram, 
##         text, token = "ngrams", n = 3)
##     trigrams_dt <- as.data.table(trigrams)
##     trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
##     setnames(trigrams_split, c("word1", "word2", "word3"))
##     trigram_counts_chunk <- trigrams_split[, .N, by = .(word1, 
##         word2, word3)][order(-N)]
##     return(trigram_counts_chunk)
## }
## 
## [[8]]
## function (text_chunk) 
## {
##     trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram, 
##         text, token = "ngrams", n = 3)
##     trigrams_dt <- as.data.table(trigrams)
##     trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
##     setnames(trigrams_split, c("word1", "word2", "word3"))
##     trigram_counts_chunk <- trigrams_split[, .N, by = .(word1, 
##         word2, word3)][order(-N)]
##     return(trigram_counts_chunk)
## }
## 
## [[9]]
## function (text_chunk) 
## {
##     trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram, 
##         text, token = "ngrams", n = 3)
##     trigrams_dt <- as.data.table(trigrams)
##     trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
##     setnames(trigrams_split, c("word1", "word2", "word3"))
##     trigram_counts_chunk <- trigrams_split[, .N, by = .(word1, 
##         word2, word3)][order(-N)]
##     return(trigram_counts_chunk)
## }
## 
## [[10]]
## function (text_chunk) 
## {
##     trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram, 
##         text, token = "ngrams", n = 3)
##     trigrams_dt <- as.data.table(trigrams)
##     trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
##     setnames(trigrams_split, c("word1", "word2", "word3"))
##     trigram_counts_chunk <- trigrams_split[, .N, by = .(word1, 
##         word2, word3)][order(-N)]
##     return(trigram_counts_chunk)
## }
## 
## [[11]]
## function (text_chunk) 
## {
##     trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram, 
##         text, token = "ngrams", n = 3)
##     trigrams_dt <- as.data.table(trigrams)
##     trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
##     setnames(trigrams_split, c("word1", "word2", "word3"))
##     trigram_counts_chunk <- trigrams_split[, .N, by = .(word1, 
##         word2, word3)][order(-N)]
##     return(trigram_counts_chunk)
## }
## 
## [[12]]
## function (text_chunk) 
## {
##     trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram, 
##         text, token = "ngrams", n = 3)
##     trigrams_dt <- as.data.table(trigrams)
##     trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
##     setnames(trigrams_split, c("word1", "word2", "word3"))
##     trigram_counts_chunk <- trigrams_split[, .N, by = .(word1, 
##         word2, word3)][order(-N)]
##     return(trigram_counts_chunk)
## }
## 
## [[13]]
## function (text_chunk) 
## {
##     trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram, 
##         text, token = "ngrams", n = 3)
##     trigrams_dt <- as.data.table(trigrams)
##     trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
##     setnames(trigrams_split, c("word1", "word2", "word3"))
##     trigram_counts_chunk <- trigrams_split[, .N, by = .(word1, 
##         word2, word3)][order(-N)]
##     return(trigram_counts_chunk)
## }
## 
## [[14]]
## function (text_chunk) 
## {
##     trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram, 
##         text, token = "ngrams", n = 3)
##     trigrams_dt <- as.data.table(trigrams)
##     trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
##     setnames(trigrams_split, c("word1", "word2", "word3"))
##     trigram_counts_chunk <- trigrams_split[, .N, by = .(word1, 
##         word2, word3)][order(-N)]
##     return(trigram_counts_chunk)
## }
## 
## [[15]]
## function (text_chunk) 
## {
##     trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram, 
##         text, token = "ngrams", n = 3)
##     trigrams_dt <- as.data.table(trigrams)
##     trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
##     setnames(trigrams_split, c("word1", "word2", "word3"))
##     trigram_counts_chunk <- trigrams_split[, .N, by = .(word1, 
##         word2, word3)][order(-N)]
##     return(trigram_counts_chunk)
## }

# Count trigrams chunk by chunk on the worker processes.
# parLapply() sends each chunk (and process_trigrams) to the workers itself,
# so text_chunks is no longer exported to every worker; that export only
# placed a full copy of the corpus in each worker's memory.
# tryCatch(finally = ) shuts the cluster down even when a worker fails.
trigram_counts_list <- tryCatch(
  parLapply(cl, text_chunks, process_trigrams),
  finally = stopCluster(cl)
)

# Combine results from all chunks: sum the per-chunk counts of each trigram.
# Rows with a missing word (produced by lines shorter than three words) are
# excluded so they do not appear as an "NA NA NA" trigram.
trigram_counts <- rbindlist(trigram_counts_list)[
  !is.na(word1) & !is.na(word2) & !is.na(word3),
  .(N = sum(N)),
  by = .(word1, word2, word3)
][order(-N)]

# Plot top trigrams. head() is used instead of [1:20] so that no all-NA rows
# are added when fewer than 20 distinct trigrams exist.
ggplot(head(trigram_counts, 20), aes(x = reorder(paste(word1, word2, word3, sep = " "), N), y = N)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Trigrams", x = "Trigram", y = "Frequency") +
  theme_minimal()

Task 3: Next Word Prediction and Shiny App

Next Word Prediction

# Predict the word that most often follows the last two words of input_text.
#
# Args:
#   input_text: text typed so far (character). Case and surrounding or
#     repeated whitespace are ignored.
#   ngram_data: trigram table (data.table or data.frame) with character
#     columns word1, word2, word3 and, optionally, a count column N.
# Returns:
#   A single string: the predicted next word, or "No prediction available"
#   when the input has fewer than two words or no trigram matches.
predict_next_word <- function(input_text, ngram_data) {
  no_prediction <- "No prediction available"

  # Normalise the input the same way the trigram table was built:
  # unnest_tokens() lower-cases by default, so "Data Science" must match
  # "data science". Splitting on runs of whitespace and dropping empty tokens
  # keeps a trailing or doubled space from becoming one of the "last two words".
  text <- paste(input_text[!is.na(input_text)], collapse = " ")
  tokens <- strsplit(trimws(tolower(text)), "\\s+")[[1]]
  tokens <- tokens[nzchar(tokens)]
  if (length(tokens) < 2L) {
    return(no_prediction)
  }
  last_two_words <- tail(tokens, 2L)

  # Rows whose first two words match; which() discards NA comparisons.
  # [[ ]] is used so no partial column-name matching can occur.
  hits <- which(
    ngram_data[["word1"]] == last_two_words[1] &
      ngram_data[["word2"]] == last_two_words[2] &
      !is.na(ngram_data[["word3"]])
  )
  if (length(hits) == 0L) {
    return(no_prediction)
  }

  # Prefer the most frequent continuation when counts are available, so the
  # result does not depend on the table being pre-sorted. which.max() returns
  # the first maximum, so a table sorted by decreasing N gives the same answer
  # as taking the first match.
  counts <- ngram_data[["N"]]
  if (is.null(counts) || all(is.na(counts[hits]))) {
    best <- hits[1L]
  } else {
    best <- hits[which.max(counts[hits])]
  }

  as.character(ngram_data[["word3"]][best])
}

# Demonstration: look up the word predicted to follow "data science"
predict_next_word(input_text = "data science", ngram_data = trigram_counts)

## [1] "hackathon"

Shiny App

if (!require("shiny")) install.packages("shiny")
library(shiny)

# User interface: a text box and a "Predict" button in the sidebar, with the
# predicted word shown in the main panel
ui <- fluidPage(
  titlePanel(title = "Next Word Prediction"),
  sidebarLayout(
    sidebarPanel = sidebarPanel(
      textInput(inputId = "input_text", label = "Enter Text:", value = "data science"),
      actionButton(inputId = "predict", label = "Predict")
    ),
    mainPanel = mainPanel(
      h3("Predicted Next Word:"),
      textOutput(outputId = "output_word")
    )
  )
)

# Server logic: the prediction is recomputed only when the button is pressed,
# not on every change to the text box
server <- function(input, output) {
  next_word <- eventReactive(
    eventExpr = input$predict,
    valueExpr = predict_next_word(input$input_text, trigram_counts)
  )

  output$output_word <- renderText(next_word())
}

# Launch the application
shinyApp(ui = ui, server = server)

Shiny applications not supported in static R Markdown documents