The goal of this project is to explore the dataset, understand its structure, and prepare for building a next-word prediction model based on n-grams. The dataset consists of English-language text from blogs, news articles, and Twitter posts.
The dataset contains three text files: en_US.blogs.txt,
en_US.news.txt, and en_US.twitter.txt. The code below
loads the data and computes summary statistics:
# Install (when missing) and attach every package this analysis uses.
# The order below is the order in which the packages were originally attached,
# so function masking between packages (e.g. dplyr vs data.table) is unchanged.
required_packages <- c(
  "dplyr", "ggplot2", "wordcloud", "data.table", "tm", "RColorBrewer",
  "tidytext", "tidyr", "stringr", "parallel", "magrittr"
)
# requireNamespace() only checks availability; it does not attach anything.
is_installed <- vapply(
  required_packages, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}
# library() attaches packages that were installed a moment ago (the
# `if (!require(x)) install.packages(x)` pattern does not) and stops with an
# error if a package is still unavailable instead of failing later.
for (pkg in required_packages) {
  library(pkg, character.only = TRUE) # magrittr provides %>%
}
setwd("E:\\RStudio\\Coursera-SwiftKey\\final\\en_US")
# Load datasets
# Read one corpus file and return its lines as a character vector.
# The file is opened in binary mode ("rb"): in text mode on Windows an embedded
# SUB (0x1A) control character is treated as end-of-file, which silently
# truncates the read. readLines() still accepts LF, CRLF or CR line endings on
# a binary connection.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE) # always release the connection
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}
blogs <- read_corpus("en_US.blogs.txt")
news <- read_corpus("en_US.news.txt")
twitter <- read_corpus("en_US.twitter.txt")
# Summary statistics: one row per source with line count, whitespace-delimited
# word count, and on-disk size in megabytes (10^6 bytes).
data_summary <- data.frame(
  Dataset = c("Blogs", "News", "Twitter"),
  Line_Count = c(length(blogs), length(news), length(twitter)),
  Word_Count = c(
    sum(str_count(blogs, "\\S+")),
    sum(str_count(news, "\\S+")),
    sum(str_count(twitter, "\\S+"))
  ),
  File_Size_MB = c(
    file.info("en_US.blogs.txt")$size / 1e+6,
    file.info("en_US.news.txt")$size / 1e+6,
    file.info("en_US.twitter.txt")$size / 1e+6
  )
)
data_summary
## Dataset Line_Count Word_Count File_Size_MB
## 1 Blogs 899288 37334131 210.1600
## 2 News 77259 2643969 205.8119
## 3 Twitter 2360148 30373583 167.1053
The distribution of word counts in each dataset is visualized below.
# Words per line for each source; a "word" is any run of non-whitespace.
per_line_counts <- lapply(
  list(Blogs = blogs, News = news, Twitter = twitter),
  str_count,
  pattern = "\\S+"
)
blogs_word_counts <- per_line_counts[["Blogs"]]
news_word_counts <- per_line_counts[["News"]]
twitter_word_counts <- per_line_counts[["Twitter"]]
# Long-format table: one row per line of text, labelled with its source
word_counts_df <- data.frame(
  Dataset = rep(
    c("Blogs", "News", "Twitter"),
    times = c(
      length(blogs_word_counts),
      length(news_word_counts),
      length(twitter_word_counts)
    )
  ),
  Word_Count = c(blogs_word_counts, news_word_counts, twitter_word_counts)
)
# One histogram panel per source; each panel gets its own y axis because the
# sources differ greatly in number of lines.
ggplot(word_counts_df, aes(x = Word_Count, fill = Dataset)) +
  geom_histogram(binwidth = 5, alpha = 0.7, position = "identity") +
  facet_wrap(~Dataset, scales = "free_y") +
  labs(
    title = "Word Count Distribution",
    x = "Words per Line",
    y = "Frequency"
  ) +
  theme_minimal()
# Combine datasets
all_text <- c(blogs, news, twitter)
# Preprocess text: lower-case, keep only a-z, apostrophes and whitespace, then
# collapse whitespace runs AND trim both ends (str_squish) so that splitting on
# a single space cannot create empty tokens at the start or end of a line.
clean_text <- tolower(all_text) %>%
  str_replace_all("[^a-z'\\s]", "") %>%
  str_squish()
# Tokenize into words
words <- unlist(str_split(clean_text, " "))
# A line that is empty after cleaning still splits to "", which would
# otherwise be counted as a word; drop those.
words <- words[nzchar(words)]
word_freq <- table(words)
# Create word cloud of the 100 most frequent words
wordcloud(
  names(word_freq),
  as.numeric(word_freq),
  max.words = 100,
  colors = brewer.pal(8, "Dark2")
)
# Tokenize into bigrams (one row per bigram occurrence)
bigrams <- tibble::tibble(text = all_text) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)
# Convert to data.table for faster processing
bigrams_dt <- as.data.table(bigrams)
# Count frequencies, most frequent first. Lines with fewer than two words
# produce an NA bigram; exclude those so NA is not ranked as a bigram.
bigram_counts <- bigrams_dt[!is.na(bigram), .N, by = bigram][order(-N)]
# Plot top bigrams. head() returns at most 20 rows, whereas [1:20] pads the
# table with NA rows when fewer than 20 distinct bigrams exist.
ggplot(head(bigram_counts, 20), aes(x = reorder(bigram, N), y = N)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Bigrams", x = "Bigram", y = "Frequency") +
  theme_minimal()
# Define Trigram Processing Function
#' Count the trigrams in one chunk of text.
#'
#' @param text_chunk Character vector of text lines.
#' @return A data.table with columns `word1`, `word2`, `word3` and `N`
#'   (occurrences within this chunk), sorted by decreasing `N`. Zero rows when
#'   the chunk contains no trigram.
process_trigrams <- function(text_chunk) {
  trigrams <- tibble::tibble(text = text_chunk) %>%
    unnest_tokens(trigram, text, token = "ngrams", n = 3)
  trigrams_dt <- as.data.table(trigrams)
  # Lines shorter than three words yield an NA trigram; drop them so they are
  # not counted as the trigram (NA, NA, NA).
  trigrams_dt <- trigrams_dt[!is.na(trigram)]
  if (nrow(trigrams_dt) == 0L) {
    # Nothing to split: return an empty table with the expected columns so the
    # caller can still combine results from all chunks.
    return(data.table(
      word1 = character(),
      word2 = character(),
      word3 = character(),
      N = integer()
    ))
  }
  trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
  setnames(trigrams_split, c("word1", "word2", "word3"))
  trigrams_split[, .N, by = .(word1, word2, word3)][order(-N)]
}
# Rebuild the combined corpus only when no object named all_text exists yet
if (!exists("all_text")) {
  all_text <- c(blogs, news, twitter)
}
# Abort early when there is nothing to process
if (identical(length(all_text), 0L)) {
  stop("Error: 'all_text' is empty. Please ensure data files are loaded correctly.")
}
# Cut the corpus into consecutive pieces of at most chunk_size lines; the
# result is a list named "1", "2", ... in corpus order.
chunk_size <- 5000
text_chunks <- split(
  all_text,
  (seq_along(all_text) - 1) %/% chunk_size + 1
)
# Use parallel processing
# detectCores() returns NA when the core count cannot be determined, and
# detectCores() - 1 is 0 on a single-core machine; makeCluster() needs at
# least one worker, so fall back to 1 in both cases.
available_cores <- detectCores()
worker_count <- if (is.na(available_cores)) {
  1L
} else {
  max(1L, available_cores - 1L) # Leave one core free when possible
}
cl <- makeCluster(worker_count)
# Attach the required packages on every worker and define process_trigrams
# there. clusterEvalQ() returns the value of the last expression from each
# worker, i.e. one copy of the function per worker.
clusterEvalQ(cl, {
  library(dplyr)
  library(tibble)
  library(tidyr)
  library(tidytext)
  library(data.table)
  library(magrittr)
  process_trigrams <- function(text_chunk) {
    trigrams <- tibble::tibble(text = text_chunk) %>%
      unnest_tokens(trigram, text, token = "ngrams", n = 3)
    trigrams_dt <- as.data.table(trigrams)
    trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
    setnames(trigrams_split, c("word1", "word2", "word3"))
    trigram_counts_chunk <- trigrams_split[, .N, by = .(word1, word2, word3)][order(-N)]
    return(trigram_counts_chunk)
  }
})
## [[1]]
## function (text_chunk)
## {
## trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram,
## text, token = "ngrams", n = 3)
## trigrams_dt <- as.data.table(trigrams)
## trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
## setnames(trigrams_split, c("word1", "word2", "word3"))
## trigram_counts_chunk <- trigrams_split[, .N, by = .(word1,
## word2, word3)][order(-N)]
## return(trigram_counts_chunk)
## }
##
## [[2]]
## function (text_chunk)
## {
## trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram,
## text, token = "ngrams", n = 3)
## trigrams_dt <- as.data.table(trigrams)
## trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
## setnames(trigrams_split, c("word1", "word2", "word3"))
## trigram_counts_chunk <- trigrams_split[, .N, by = .(word1,
## word2, word3)][order(-N)]
## return(trigram_counts_chunk)
## }
##
## [[3]]
## function (text_chunk)
## {
## trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram,
## text, token = "ngrams", n = 3)
## trigrams_dt <- as.data.table(trigrams)
## trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
## setnames(trigrams_split, c("word1", "word2", "word3"))
## trigram_counts_chunk <- trigrams_split[, .N, by = .(word1,
## word2, word3)][order(-N)]
## return(trigram_counts_chunk)
## }
##
## [[4]]
## function (text_chunk)
## {
## trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram,
## text, token = "ngrams", n = 3)
## trigrams_dt <- as.data.table(trigrams)
## trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
## setnames(trigrams_split, c("word1", "word2", "word3"))
## trigram_counts_chunk <- trigrams_split[, .N, by = .(word1,
## word2, word3)][order(-N)]
## return(trigram_counts_chunk)
## }
##
## [[5]]
## function (text_chunk)
## {
## trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram,
## text, token = "ngrams", n = 3)
## trigrams_dt <- as.data.table(trigrams)
## trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
## setnames(trigrams_split, c("word1", "word2", "word3"))
## trigram_counts_chunk <- trigrams_split[, .N, by = .(word1,
## word2, word3)][order(-N)]
## return(trigram_counts_chunk)
## }
##
## [[6]]
## function (text_chunk)
## {
## trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram,
## text, token = "ngrams", n = 3)
## trigrams_dt <- as.data.table(trigrams)
## trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
## setnames(trigrams_split, c("word1", "word2", "word3"))
## trigram_counts_chunk <- trigrams_split[, .N, by = .(word1,
## word2, word3)][order(-N)]
## return(trigram_counts_chunk)
## }
##
## [[7]]
## function (text_chunk)
## {
## trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram,
## text, token = "ngrams", n = 3)
## trigrams_dt <- as.data.table(trigrams)
## trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
## setnames(trigrams_split, c("word1", "word2", "word3"))
## trigram_counts_chunk <- trigrams_split[, .N, by = .(word1,
## word2, word3)][order(-N)]
## return(trigram_counts_chunk)
## }
##
## [[8]]
## function (text_chunk)
## {
## trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram,
## text, token = "ngrams", n = 3)
## trigrams_dt <- as.data.table(trigrams)
## trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
## setnames(trigrams_split, c("word1", "word2", "word3"))
## trigram_counts_chunk <- trigrams_split[, .N, by = .(word1,
## word2, word3)][order(-N)]
## return(trigram_counts_chunk)
## }
##
## [[9]]
## function (text_chunk)
## {
## trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram,
## text, token = "ngrams", n = 3)
## trigrams_dt <- as.data.table(trigrams)
## trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
## setnames(trigrams_split, c("word1", "word2", "word3"))
## trigram_counts_chunk <- trigrams_split[, .N, by = .(word1,
## word2, word3)][order(-N)]
## return(trigram_counts_chunk)
## }
##
## [[10]]
## function (text_chunk)
## {
## trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram,
## text, token = "ngrams", n = 3)
## trigrams_dt <- as.data.table(trigrams)
## trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
## setnames(trigrams_split, c("word1", "word2", "word3"))
## trigram_counts_chunk <- trigrams_split[, .N, by = .(word1,
## word2, word3)][order(-N)]
## return(trigram_counts_chunk)
## }
##
## [[11]]
## function (text_chunk)
## {
## trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram,
## text, token = "ngrams", n = 3)
## trigrams_dt <- as.data.table(trigrams)
## trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
## setnames(trigrams_split, c("word1", "word2", "word3"))
## trigram_counts_chunk <- trigrams_split[, .N, by = .(word1,
## word2, word3)][order(-N)]
## return(trigram_counts_chunk)
## }
##
## [[12]]
## function (text_chunk)
## {
## trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram,
## text, token = "ngrams", n = 3)
## trigrams_dt <- as.data.table(trigrams)
## trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
## setnames(trigrams_split, c("word1", "word2", "word3"))
## trigram_counts_chunk <- trigrams_split[, .N, by = .(word1,
## word2, word3)][order(-N)]
## return(trigram_counts_chunk)
## }
##
## [[13]]
## function (text_chunk)
## {
## trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram,
## text, token = "ngrams", n = 3)
## trigrams_dt <- as.data.table(trigrams)
## trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
## setnames(trigrams_split, c("word1", "word2", "word3"))
## trigram_counts_chunk <- trigrams_split[, .N, by = .(word1,
## word2, word3)][order(-N)]
## return(trigram_counts_chunk)
## }
##
## [[14]]
## function (text_chunk)
## {
## trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram,
## text, token = "ngrams", n = 3)
## trigrams_dt <- as.data.table(trigrams)
## trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
## setnames(trigrams_split, c("word1", "word2", "word3"))
## trigram_counts_chunk <- trigrams_split[, .N, by = .(word1,
## word2, word3)][order(-N)]
## return(trigram_counts_chunk)
## }
##
## [[15]]
## function (text_chunk)
## {
## trigrams <- tibble::tibble(text = text_chunk) %>% unnest_tokens(trigram,
## text, token = "ngrams", n = 3)
## trigrams_dt <- as.data.table(trigrams)
## trigrams_split <- trigrams_dt[, tstrsplit(trigram, " ", fixed = TRUE)]
## setnames(trigrams_split, c("word1", "word2", "word3"))
## trigram_counts_chunk <- trigrams_split[, .N, by = .(word1,
## word2, word3)][order(-N)]
## return(trigram_counts_chunk)
## }
# Count trigrams chunk by chunk on the workers.
# parLapply() itself sends each worker its share of text_chunks together with
# the function, so the whole corpus is not additionally exported to every
# worker (that would give each worker process a full copy of the data).
# The cluster is stopped in `finally` so workers are shut down even when a
# chunk fails.
trigram_counts_list <- tryCatch(
  parLapply(cl, text_chunks, process_trigrams),
  finally = stopCluster(cl)
)
# Combine results from all chunks: sum the per-chunk counts of each trigram
# and sort by decreasing total frequency.
trigram_counts <- rbindlist(trigram_counts_list)[
  , .(N = sum(N)), by = .(word1, word2, word3)
][order(-N)]
# Plot top trigrams. trigram_counts is sorted by decreasing N, so the first
# rows are the most frequent; head() returns at most 20 rows, whereas [1:20]
# pads the table with NA rows when fewer than 20 trigrams exist.
ggplot(
  head(trigram_counts, 20),
  aes(x = reorder(paste(word1, word2, word3, sep = " "), N), y = N)
) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Top 20 Trigrams", x = "Trigram", y = "Frequency") +
  theme_minimal()
# Prediction function
#' Predict the next word from the last two words of the input.
#'
#' @param input_text Text typed so far (character). Case, punctuation and
#'   extra whitespace are ignored.
#' @param ngram_data Trigram table (data.table or data.frame) with character
#'   columns `word1`, `word2`, `word3`. The first matching row wins, so the
#'   table should be sorted by decreasing frequency.
#' @return The predicted word, or "No prediction available" when the input has
#'   fewer than two words or no trigram starts with its last two words.
predict_next_word <- function(input_text, ngram_data) {
  no_prediction <- "No prediction available"
  # Normalise the input like the trigram tokens: lower case, punctuation other
  # than apostrophes turned into separators.
  cleaned <- paste(input_text[!is.na(input_text)], collapse = " ")
  cleaned <- gsub("[^[:alnum:]'[:space:]]+", " ", tolower(cleaned))
  # Split on any run of whitespace and drop empty tokens, so repeated or
  # trailing spaces cannot produce "" as one of the last two words.
  tokens <- strsplit(trimws(cleaned), "[[:space:]]+")[[1]]
  tokens <- tokens[nzchar(tokens)]
  if (length(tokens) < 2L) {
    return(no_prediction)
  }
  last_two_words <- tail(tokens, 2) # Get the last two words
  # Rows whose first two words match; which() ignores NA comparisons.
  hits <- which(
    ngram_data$word1 == last_two_words[1] &
      ngram_data$word2 == last_two_words[2]
  )
  if (length(hits) == 0L) {
    return(no_prediction)
  }
  next_word <- as.character(ngram_data$word3[hits[1]])
  if (is.na(next_word)) no_prediction else next_word
}
# Example usage: look up a continuation for "data science" in the combined
# trigram table (the first matching row of trigram_counts is returned).
predict_next_word("data science", trigram_counts)
## [1] "hackathon"
if (!require("shiny")) install.packages("shiny")
library(shiny)
# User interface: a sidebar with the text box and trigger button, and a main
# area that shows the predicted word.
input_controls <- sidebarPanel(
  textInput(inputId = "input_text", label = "Enter Text:", value = "data science"),
  actionButton(inputId = "predict", label = "Predict")
)
prediction_display <- mainPanel(
  h3("Predicted Next Word:"),
  textOutput(outputId = "output_word")
)
ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  sidebarLayout(input_controls, prediction_display)
)
# Server logic: the prediction is recomputed only when the "predict" button is
# pressed, not on every keystroke in the text box.
server <- function(input, output) {
  next_word <- eventReactive(
    input$predict,
    predict_next_word(input$input_text, trigram_counts)
  )
  output$output_word <- renderText(next_word())
}
# Launch the application
shinyApp(ui = ui, server = server)