This is the final report for the capstone project within the Data Science Specialization Program from John Hopkins University.
The data provided can be downloaded here: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
# Bring in data - Locally Saved
# Load Data
fileBlog <- readLines("en_US.blogs.txt", encoding = "UTF-8")
fileNews <- readLines("en_US.news.txt", encoding = "UTF-8")
fileTwitter <- readLines("en_US.twitter.txt", encoding = "UTF-8")
There are now 3 large character vectors.
#Creating function to auto clean
clean_data <- function(data) {
data <- trimws(data) # Remove leading/trailing whitespace
data <- unique(data) # Remove duplicates
return(data[nzchar(data)]) # Remove empty strings
}
fileBlog <- clean_data(fileBlog)
fileNews <- clean_data(fileNews)
fileTwitter <- clean_data(fileTwitter)
Due to memory issues, had to swap to data.table and try to be more efficient. Then everything worked but couldn’t KNIT so swapped to chunk processing
process_ngrams_in_chunks <- function(data, n, chunk_size = 10000) {
num_chunks <- ceiling(length(data) / chunk_size) # Calculate total chunks
freq_table <- data.table(Phrase = character(), Frequency = integer()) # Initialize empty table
for (i in seq_len(num_chunks)) {
# Get the current chunk
chunk_start <- (i - 1) * chunk_size + 1
chunk_end <- min(i * chunk_size, length(data))
chunk <- data[chunk_start:chunk_end]
# Generate n-grams for the chunk
ngrams <- tokenize_ngrams(chunk, n = n)
# Create frequency table for the chunk
chunk_table <- data.table(Phrase = unlist(ngrams))
chunk_table <- chunk_table[, .N, by = Phrase]
setnames(chunk_table, "N", "Frequency")
# Merge with the main frequency table
freq_table <- rbindlist(list(freq_table, chunk_table))
freq_table <- freq_table[, .(Frequency = sum(Frequency)), by = Phrase]
}
# Order by frequency
setorder(freq_table, -Frequency)
return(freq_table)
}
biGramBlog_table <- process_ngrams_in_chunks(fileBlog, 2, chunk_size = 100000)
triGramBlog_table <- process_ngrams_in_chunks(fileBlog, 3, chunk_size = 100000)
biGramNews_table <- process_ngrams_in_chunks(fileNews, 2, chunk_size = 100000)
triGramNews_table <- process_ngrams_in_chunks(fileNews, 3, chunk_size = 100000)
biGramTwitter_table <- process_ngrams_in_chunks(fileTwitter, 2, chunk_size = 100000)
triGramTwitter_table <- process_ngrams_in_chunks(fileTwitter, 3, chunk_size = 100000)
# Free memory
rm(fileBlog, fileNews, fileTwitter)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 44123392 2356.5 80005505 4272.8 48595323 2595.3
## Vcells 234688905 1790.6 590853339 4507.9 590853339 4507.9
Now we combine the tables so we can use all the data.
combine_tables <- function(...) {
combined_table <- rbindlist(list(...))
combined_table <- combined_table[, .(Frequency = sum(Frequency)), by = Phrase]
setorder(combined_table, -Frequency)
return(combined_table)
}
combined_bigram_table <- combine_tables(biGramBlog_table, biGramNews_table, biGramTwitter_table)
combined_trigram_table <- combine_tables(triGramBlog_table, triGramNews_table, triGramTwitter_table)
Now let’s write the function to use these tables to actually predict the next word.
predict_next_word <- function(input_phrase, bigram_table, trigram_table = NULL) {
input_phrase <- tolower(trimws(input_phrase)) # Clean input
words <- unlist(strsplit(input_phrase, "\\s+"))
n <- length(words)
# Use trigrams if input has 2+ words
if (n >= 2 && !is.null(trigram_table)) {
last_two_words <- paste(words[(n - 1):n], collapse = " ")
matches <- trigram_table[Phrase %like% paste0("^", last_two_words, " ")]
if (nrow(matches) > 0) return(matches$Phrase[1])
}
# Use bigrams if input has 1+ words
if (n >= 1) {
last_word <- words[n]
matches <- bigram_table[Phrase %like% paste0("^", last_word, " ")]
if (nrow(matches) > 0) return(matches$Phrase[1])
}
return("No prediction available")
}
Saving frequency tables for Shiny App
saveRDS(combined_bigram_table, "combined_bigram_table.rds")
saveRDS(combined_trigram_table, "combined_trigram_table.rds")
Time to test if it works!
predicted_word <- predict_next_word("and I'd", combined_bigram_table, combined_trigram_table)
print(predicted_word)
## [1] "and i'd like"
It didn’t work, but it does now ;)
plot_top_ngrams <- function(ngram_table, title, top_n = 10) {
top_ngrams <- head(ngram_table, top_n)
ggplot(top_ngrams, aes(x = reorder(Phrase, Frequency), y = Frequency)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = title, x = "N-grams", y = "Frequency") +
theme_minimal()
}
plot_top_ngrams(combined_bigram_table, "Top 10 Combined Bigrams")