Final Report

This is the final report for the capstone project in the Data Science Specialization from Johns Hopkins University.

Understanding the problem

  • We want to predict the next word given the previous 1, 2, or 3 words, using a basic n-gram model.
  • An n-gram model is trained on a corpus (a large body of text).
  • The model counts how often each sequence of n words occurs in the corpus.
  • It then estimates the probability of the next word given the previous n-1 words, as sketched in the example below.
  • The word with the highest estimated probability is chosen as the prediction.
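As a concrete toy illustration of that estimate (the counts here are made up for the example): if "and i'd" occurs 100 times and "and i'd like" occurs 50 times, then P(like | and i'd) = 50/100 = 0.5.

# Toy maximum-likelihood estimate: P(w | u v) = count(u v w) / count(u v)
trigram_counts <- c("and i'd like" = 50, "and i'd love" = 30, "and i'd say" = 20)
bigram_count   <- 100  # how often the context "and i'd" appears

probs <- trigram_counts / bigram_count
names(probs)[which.max(probs)]  # "and i'd like" -> predict "like"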

Data Acquisition

The data provided can be downloaded here: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

# Bring in data - Locally Saved
# Load Data
fileBlog <- readLines("en_US.blogs.txt", encoding = "UTF-8")
fileNews <- readLines("en_US.news.txt", encoding = "UTF-8")
fileTwitter <- readLines("en_US.twitter.txt", encoding = "UTF-8")

There are now three large character vectors, one element per line of text.
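A quick, optional sanity check on what was loaded (exact counts will vary with the download):

# Rough size of each corpus
length(fileBlog)                                # lines in the blogs file
length(fileNews)                                # lines in the news file
format(object.size(fileTwitter), units = "MB")  # approximate memory use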

Clean and prepare data

# Helper function to clean each corpus in one pass
clean_data <- function(data) {
  data <- trimws(data)      # Remove leading/trailing whitespace
  data <- unique(data)      # Remove duplicates
  return(data[nzchar(data)])  # Remove empty strings
}

fileBlog <- clean_data(fileBlog)
fileNews <- clean_data(fileNews)
fileTwitter <- clean_data(fileTwitter)
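A toy example of what the cleaner does (the input here is made up for illustration):

# Trims "  hello  ", drops the duplicate "hello" and the empty string
clean_data(c("  hello  ", "hello", "", "world"))  # returns c("hello", "world")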

Due to memory issues, I switched to data.table and tried to be more efficient. That worked interactively, but the document still couldn’t knit, so I moved to processing the corpora in chunks.

library(data.table)   # fast in-memory frequency tables
library(tokenizers)   # provides tokenize_ngrams()

process_ngrams_in_chunks <- function(data, n, chunk_size = 10000) {
  num_chunks <- ceiling(length(data) / chunk_size)  # Calculate total chunks
  freq_table <- data.table(Phrase = character(), Frequency = integer())  # Initialize empty table

  for (i in seq_len(num_chunks)) {
    # Get the current chunk
    chunk_start <- (i - 1) * chunk_size + 1
    chunk_end <- min(i * chunk_size, length(data))
    chunk <- data[chunk_start:chunk_end]

    # Generate n-grams for the chunk
    ngrams <- tokenize_ngrams(chunk, n = n)

    # Create frequency table for the chunk
    chunk_table <- data.table(Phrase = unlist(ngrams))
    chunk_table <- chunk_table[, .N, by = Phrase]
    setnames(chunk_table, "N", "Frequency")

    # Merge with the main frequency table
    freq_table <- rbindlist(list(freq_table, chunk_table))
    freq_table <- freq_table[, .(Frequency = sum(Frequency)), by = Phrase]
  }

  # Order by frequency
  setorder(freq_table, -Frequency)
  return(freq_table)
}
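Note the design choice inside the loop: the table is re-aggregated after every chunk, so duplicate phrases are collapsed as we go and the working table stays bounded by the number of distinct n-grams rather than growing with every chunk appended.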

biGramBlog_table <- process_ngrams_in_chunks(fileBlog, 2, chunk_size = 100000)
triGramBlog_table <- process_ngrams_in_chunks(fileBlog, 3, chunk_size = 100000)

biGramNews_table <- process_ngrams_in_chunks(fileNews, 2, chunk_size = 100000)
triGramNews_table <- process_ngrams_in_chunks(fileNews, 3, chunk_size = 100000)

biGramTwitter_table <- process_ngrams_in_chunks(fileTwitter, 2, chunk_size = 100000)
triGramTwitter_table <- process_ngrams_in_chunks(fileTwitter, 3, chunk_size = 100000)

# Free memory
rm(fileBlog, fileNews, fileTwitter)
gc()
##             used   (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  44123392 2356.5   80005505 4272.8  48595323 2595.3
## Vcells 234688905 1790.6  590853339 4507.9 590853339 4507.9

Now we combine the tables so we can use all the data.

Combine frequency tables

combine_tables <- function(...) {
  combined_table <- rbindlist(list(...))
  combined_table <- combined_table[, .(Frequency = sum(Frequency)), by = Phrase]
  setorder(combined_table, -Frequency)
  return(combined_table)
}

combined_bigram_table <- combine_tables(biGramBlog_table, biGramNews_table, biGramTwitter_table)
combined_trigram_table <- combine_tables(triGramBlog_table, triGramNews_table, triGramTwitter_table)

Now let’s write the function to use these tables to actually predict the next word.

Predict next word function

predict_next_word <- function(input_phrase, bigram_table, trigram_table = NULL) {
  input_phrase <- tolower(trimws(input_phrase))  # Clean input
  words <- unlist(strsplit(input_phrase, "\\s+"))
  n <- length(words)

  # Back off from trigrams: if the input has 2+ words, look for trigrams
  # that begin with the last two words
  if (n >= 2 && !is.null(trigram_table)) {
    last_two_words <- paste(words[(n - 1):n], collapse = " ")
    matches <- trigram_table[Phrase %like% paste0("^", last_two_words, " ")]
    # Tables are sorted by frequency, so the first match is the most likely;
    # the final word of the returned phrase is the predicted word
    if (nrow(matches) > 0) return(matches$Phrase[1])
  }

  # Otherwise fall back to bigrams keyed on the last word alone
  if (n >= 1) {
    last_word <- words[n]
    matches <- bigram_table[Phrase %like% paste0("^", last_word, " ")]
    if (nrow(matches) > 0) return(matches$Phrase[1])
  }

  return("No prediction available")
}
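One caveat worth noting: %like% interprets its pattern as a regular expression, so input containing regex metacharacters (., ?, parentheses, etc.) could match incorrectly. A fixed-string variant of the lookup, sketched below with base R's startsWith(), sidesteps this; treat it as optional hardening rather than part of the original function:

# Fixed-string prefix match: the input is never treated as a regex
safe_prefix_matches <- function(tbl, prefix) {
  tbl[startsWith(Phrase, paste0(prefix, " "))]
}

# Inside predict_next_word, the trigram lookup would become:
# matches <- safe_prefix_matches(trigram_table, last_two_words)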

Saving frequency tables for the Shiny app

saveRDS(combined_bigram_table, "combined_bigram_table.rds")
saveRDS(combined_trigram_table, "combined_trigram_table.rds")
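The Shiny app can then load these tables once at startup (a sketch; the paths assume the .rds files sit in the app directory):

# At the top of the Shiny app (e.g., in global.R)
combined_bigram_table  <- readRDS("combined_bigram_table.rds")
combined_trigram_table <- readRDS("combined_trigram_table.rds")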

Time to test if it works!

Test predictions

predicted_word <- predict_next_word("and I'd", combined_bigram_table, combined_trigram_table)
print(predicted_word)
## [1] "and i'd like"

It didn’t work at first, but it does now ;) Note that the function returns the full matching trigram, "and i'd like"; its final word, "like", is the predicted next word.

Visualizations

library(ggplot2)

plot_top_ngrams <- function(ngram_table, title, top_n = 10) {
  top_ngrams <- head(ngram_table, top_n)
  ggplot(top_ngrams, aes(x = reorder(Phrase, Frequency), y = Frequency)) +
    geom_bar(stat = "identity", fill = "steelblue") +
    coord_flip() +
    labs(title = title, x = "N-grams", y = "Frequency") +
    theme_minimal()
}

plot_top_ngrams(combined_bigram_table, "Top 10 Combined Bigrams")
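The same helper works for the trigram table:

plot_top_ngrams(combined_trigram_table, "Top 10 Combined Trigrams")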