Final Report

This is the final report for the capstone project in the Data Science Specialization from Johns Hopkins University.

Understanding the problem

  • We want to predict the next word given the previous 1, 2, or 3 words, using a basic n-gram model.
  • An n-gram model is trained on a corpus (a large body of text).
  • The model counts how often each sequence of n words occurs in the corpus.
  • It then estimates the probability of the next word given the previous n-1 words, as sketched in the example below.
  • The word with the highest estimated probability is chosen as the prediction.
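As a concrete toy illustration of that estimate (the counts here are made up for the example): if "and i'd" occurs 100 times and "and i'd like" occurs 50 times, then P(like | and i'd) = 50/100 = 0.5.

# Toy maximum-likelihood estimate: P(w | u v) = count(u v w) / count(u v)
trigram_counts <- c("and i'd like" = 50, "and i'd love" = 30, "and i'd say" = 20)
bigram_count   <- 100  # how often the context "and i'd" appears

probs <- trigram_counts / bigram_count
names(probs)[which.max(probs)]  # "and i'd like" -> predict "like"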

Data Acquisition

The data provided can be downloaded here: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

# Bring in data - Locally Saved
# Load Data
fileBlog <- readLines("en_US.blogs.txt", encoding = "UTF-8")
fileNews <- readLines("en_US.news.txt", encoding = "UTF-8")
fileTwitter <- readLines("en_US.twitter.txt", encoding = "UTF-8")

There are now three large character vectors, one element per line of text.
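A quick, optional sanity check on what was loaded (exact counts will vary with the download):

# Rough size of each corpus
length(fileBlog)                                # lines in the blogs file
length(fileNews)                                # lines in the news file
format(object.size(fileTwitter), units = "MB")  # approximate memory use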

Clean and prepare data

# Helper function to clean each corpus in one pass
clean_data <- function(data) {
  data <- trimws(data)      # Remove leading/trailing whitespace
  data <- unique(data)      # Remove duplicates
  return(data[nzchar(data)])  # Remove empty strings
}

fileBlog <- clean_data(fileBlog)
fileNews <- clean_data(fileNews)
fileTwitter <- clean_data(fileTwitter)
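A toy example of what the cleaner does (the input here is made up for illustration):

# Trims "  hello  ", drops the duplicate "hello" and the empty string
clean_data(c("  hello  ", "hello", "", "world"))  # returns c("hello", "world")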

Due to memory issues, I switched to data.table and tried to be more efficient. That worked interactively, but the document still couldn’t knit, so I moved to processing the corpora in chunks.

library(data.table)   # fast in-memory frequency tables
library(tokenizers)   # provides tokenize_ngrams()

process_ngrams_in_chunks <- function(data, n, chunk_size = 10000) {
  num_chunks <- ceiling(length(data) / chunk_size)  # Calculate total chunks
  freq_table <- data.table(Phrase = character(), Frequency = integer())  # Initialize empty table

  for (i in seq_len(num_chunks)) {
    # Get the current chunk
    chunk_start <- (i - 1) * chunk_size + 1
    chunk_end <- min(i * chunk_size, length(data))
    chunk <- data[chunk_start:chunk_end]

    # Generate n-grams for the chunk
    ngrams <- tokenize_ngrams(chunk, n = n)

    # Create frequency table for the chunk
    chunk_table <- data.table(Phrase = unlist(ngrams))
    chunk_table <- chunk_table[, .N, by = Phrase]
    setnames(chunk_table, "N", "Frequency")

    # Merge with the main frequency table
    freq_table <- rbindlist(list(freq_table, chunk_table))
    freq_table <- freq_table[, .(Frequency = sum(Frequency)), by = Phrase]
  }

  # Order by frequency
  setorder(freq_table, -Frequency)
  return(freq_table)
}
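Note the design choice inside the loop: the table is re-aggregated after every chunk, so duplicate phrases are collapsed as we go and the working table stays bounded by the number of distinct n-grams rather than growing with every chunk appended.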

biGramBlog_table <- process_ngrams_in_chunks(fileBlog, 2, chunk_size = 100000)
triGramBlog_table <- process_ngrams_in_chunks(fileBlog, 3, chunk_size = 100000)

biGramNews_table <- process_ngrams_in_chunks(fileNews, 2, chunk_size = 100000)
triGramNews_table <- process_ngrams_in_chunks(fileNews, 3, chunk_size = 100000)

biGramTwitter_table <- process_ngrams_in_chunks(fileTwitter, 2, chunk_size = 100000)
triGramTwitter_table <- process_ngrams_in_chunks(fileTwitter, 3, chunk_size = 100000)

# Free memory
rm(fileBlog, fileNews, fileTwitter)
gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  44123392 2356.5   80005505 4272.8  48595323 2595.3
## Vcells 234688905 1790.6  590853339 4507.9 590853339 4507.9

Now we combine the tables so we can use all the data.

Combine frequency tables

combine_tables <- function(...) {
  combined_table <- rbindlist(list(...))
  combined_table <- combined_table[, .(Frequency = sum(Frequency)), by = Phrase]
  setorder(combined_table, -Frequency)
  return(combined_table)
}

combined_bigram_table <- combine_tables(biGramBlog_table, biGramNews_table, biGramTwitter_table)
combined_trigram_table <- combine_tables(triGramBlog_table, triGramNews_table, triGramTwitter_table)

Now let’s write the function to use these tables to actually predict the next word.

Predict next word function

predict_next_word <- function(input_phrase, bigram_table, trigram_table = NULL) {
  input_phrase <- tolower(trimws(input_phrase))  # Clean input
  words <- unlist(strsplit(input_phrase, "\\s+"))
  n <- length(words)

  # Back off from trigrams: if the input has 2+ words, look for trigrams
  # that begin with the last two words
  if (n >= 2 && !is.null(trigram_table)) {
    last_two_words <- paste(words[(n - 1):n], collapse = " ")
    matches <- trigram_table[Phrase %like% paste0("^", last_two_words, " ")]
    # Tables are sorted by frequency, so the first match is the most likely;
    # the final word of the returned phrase is the predicted word
    if (nrow(matches) > 0) return(matches$Phrase[1])
  }

  # Otherwise fall back to bigrams keyed on the last word alone
  if (n >= 1) {
    last_word <- words[n]
    matches <- bigram_table[Phrase %like% paste0("^", last_word, " ")]
    if (nrow(matches) > 0) return(matches$Phrase[1])
  }

  return("No prediction available")
}
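One caveat worth noting: %like% interprets its pattern as a regular expression, so input containing regex metacharacters (., ?, parentheses, etc.) could match incorrectly. A fixed-string variant of the lookup, sketched below with base R's startsWith(), sidesteps this; treat it as optional hardening rather than part of the original function:

# Fixed-string prefix match: the input is never treated as a regex
safe_prefix_matches <- function(tbl, prefix) {
  tbl[startsWith(Phrase, paste0(prefix, " "))]
}

# Inside predict_next_word, the trigram lookup would become:
# matches <- safe_prefix_matches(trigram_table, last_two_words)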

Saving frequency tables for the Shiny app

saveRDS(combined_bigram_table, "combined_bigram_table.rds")
saveRDS(combined_trigram_table, "combined_trigram_table.rds")
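The Shiny app can then load these tables once at startup (a sketch; the paths assume the .rds files sit in the app directory):

# At the top of the Shiny app (e.g., in global.R)
combined_bigram_table  <- readRDS("combined_bigram_table.rds")
combined_trigram_table <- readRDS("combined_trigram_table.rds")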

Time to test if it works!

Test predictions

predicted_word <- predict_next_word("and I'd", combined_bigram_table, combined_trigram_table)
print(predicted_word)
## [1] "and i'd like"

It didn’t work at first, but it does now ;) Note that the function returns the full matching trigram, "and i'd like"; its final word, "like", is the predicted next word.

Visualizations

library(ggplot2)

plot_top_ngrams <- function(ngram_table, title, top_n = 10) {
  top_ngrams <- head(ngram_table, top_n)
  ggplot(top_ngrams, aes(x = reorder(Phrase, Frequency), y = Frequency)) +
    geom_bar(stat = "identity", fill = "steelblue") +
    coord_flip() +
    labs(title = title, x = "N-grams", y = "Frequency") +
    theme_minimal()
}

plot_top_ngrams(combined_bigram_table, "Top 10 Combined Bigrams")
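The same helper works for the trigram table:

plot_top_ngrams(combined_trigram_table, "Top 10 Combined Trigrams")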