Predicting Next Words Using N-Gram Models

Author

Siyang Ni

Data Info

First, let’s get basic information about each of the text files we are working with:

library(pacman)
p_load(wordpredictor)
p_load(tidyverse)

tweets_path <- "C:/Users/siyan/Downloads/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
blogs_path <- "C:/Users/siyan/Downloads/Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
news_path <- "C:/Users/siyan/Downloads/Coursera-SwiftKey/final/en_US/en_US.news.txt"
banned_words_file <- "C:/Users/siyan/Downloads/Coursera-SwiftKey/final/en_US/badwords.txt"


# Load the data
twitter <- read_lines(tweets_path)
Warning: One or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)
blogs <- read_lines(blogs_path)
news <- read_lines(news_path)
badwords <- read_lines(banned_words_file)

## Combine into one object
data <- c(twitter, blogs, news)
# Get file info
blogs_size = utils:::format.object_size(file.size(blogs_path), "auto")
news_size = utils:::format.object_size(file.size(news_path), "auto")
twitter_size = utils:::format.object_size(file.size(tweets_path), "auto")

blogs_lines = length(blogs)
news_lines = length(news)
twitter_lines = length(twitter)

blogs_char = nchar(blogs)
news_char = nchar(news)
twitter_char = nchar(twitter)
blog_longest = max(blogs_char)
news_longest = max(news_char)
twitter_longest = max(twitter_char)
blogsChar = sum(blogs_char)
newsChar = sum(news_char)
twitterChar = sum(twitter_char)

File = c("Blogs", "News", "Twitter")
Size = c(blogs_size, news_size, twitter_size)
Lines_number = c(blogs_lines, news_lines, twitter_lines)
Longest_line_length = c(blog_longest, news_longest, twitter_longest) 
Characters = c(blogsChar, newsChar, twitterChar)

# Avoid cbind() here: it would coerce the numeric columns to character
data_summary <- data.frame(File, Size, Lines_number, Longest_line_length, Characters)
data_summary
     File     Size Lines_number Longest_line_length Characters
1   Blogs 200.4 Mb       899288               40833  206824505
2    News 196.3 Mb      1010242               11384  203223159
3 Twitter 159.4 Mb      2360148                 140  162096031
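
As a quick supplement to the table above, approximate word counts per file can be computed with stringi (a rough sketch; stri_count_words() uses ICU word boundaries, so these are estimates and were not part of the original summary):

p_load(stringi)
# Approximate word counts per file (ICU word boundaries)
word_counts <- c(
  Blogs   = sum(stri_count_words(blogs)),
  News    = sum(stri_count_words(news)),
  Twitter = sum(stri_count_words(twitter))
)
word_counts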

Data Sampling

The dataset is fairly large for a course project. For computational efficiency, we randomly sample 10% of the data from each of the three files.

# Set a seed for reproducibility
set.seed(666)

# Calculate 10% of each dataset's length
sample_size_tweets <- round(length(twitter) * 0.10)
sample_size_blogs <- round(length(blogs) * 0.10)
sample_size_news <- round(length(news) * 0.10)

# Randomly sample 10% of each dataset
sampled_tweets <- sample(twitter, size = sample_size_tweets, replace = FALSE)
sampled_blogs <- sample(blogs, size = sample_size_blogs, replace = FALSE)
sampled_news <- sample(news, size = sample_size_news, replace = FALSE)

# Combine the sampled datasets into one object
sampled_data <- c(sampled_tweets, sampled_blogs, sampled_news)

# Check the result
length(sampled_data)
[1] 426968
# Show a few lines
head(sampled_data)
[1] "\"Happy Mother's Day\""                                                                                                   
[2] "Love can't always go the way you want it to."                                                                             
[3] "merry xmas everyone"                                                                                                      
[4] "I just have the best friends EVER"                                                                                        
[5] "When your work speaks for itself, don't interrupt. ~ Henry J. Kaiser"                                                     
[6] "Looking for Great Holiday Program Staff in NY, San Fran, Orlando, Miami, Atlanta, Seattle, Boston and Chicago! email ASAP"

We further split the sampled data into an 80% training set, a 10% validation set, and a 10% reserved test set.

# Set a seed for reproducibility
set.seed(666)

# Shuffle the data
sampled_data <- sample(sampled_data)

# Calculate indices for splitting
total_length <- length(sampled_data)
train_end <- round(total_length * 0.80)
valid_end <- train_end + round(total_length * 0.10)

# Split into training, validation, and testing
train_data <- sampled_data[1:train_end]
valid_data <- sampled_data[(train_end + 1):valid_end]
test_data <- sampled_data[(valid_end + 1):total_length]

# Check the lengths of each split
length(train_data)  # Should be around 80% of the original length
[1] 341574
length(valid_data)  # Should be around 10% of the original length
[1] 42697
length(test_data)   # Should be around 10% of the original length
[1] 42697
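
As an additional sanity check (not required for the pipeline), the split proportions can be confirmed directly; this is a small illustrative snippet:

# Share of the sample in each split; should be roughly 0.80 / 0.10 / 0.10
round(c(train = length(train_data),
        valid = length(valid_data),
        test  = length(test_data)) / total_length, 3)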

Data Cleaning

We clean all three sets (training, validation, and test) with the same function:

p_load(tm)
p_load(stringr)


clean_text_data <- function(input_data, banned_words) {
  # Ensure inputs are character vectors
  if (!is.character(input_data)) {
    stop("input_data must be a character vector containing the text data.")
  }
  if (!is.character(banned_words)) {
    stop("banned_words must be a character vector containing the words to remove.")
  }

  # Create a text corpus from input data
  corpus <- Corpus(VectorSource(input_data))

  # Custom content transformer that combines multiple steps
  clean_corpus <- content_transformer(function(text) {
    text <- iconv(text, to = "UTF-8", sub = "byte")   # Normalize encoding to UTF-8
    text <- tolower(text)                             # Lowercase
    text <- gsub("https?://\\S+", " ", text)          # Drop URLs
    text <- removePunctuation(text,
                              preserve_intra_word_dashes = TRUE,
                              preserve_intra_word_contractions = TRUE)
    text <- removeNumbers(text)                       # Drop digits
    text <- stripWhitespace(text)                     # Collapse repeated whitespace
    text <- removeWords(text, stopwords("english"))   # Drop English stop words
    text <- removeWords(text, banned_words)           # Drop banned words
    text <- gsub("\\b[a-z]\\b", " ", text)            # Drop single letters
    text <- gsub("[[:space:]]+", " ", text)           # Collapse whitespace again
    text <- gsub("\\W+", " ", text)                   # Replace non-word characters with spaces
    text <- gsub("\\b\\w{1,2}\\b", "", text)          # Drop words of 1-2 characters
    text <- gsub('"', '', text)                       # Drop stray quotation marks
    text <- gsub("^\\s+|\\s+$", "", text)             # Trim leading/trailing whitespace
    text <- stemDocument(text)                        # Stem each word
    return(text)
  })
  # Apply the custom transformation to the corpus
  corpus <- tm_map(corpus, clean_corpus)

  # Convert corpus to character vector
  cleaned_text <- sapply(corpus, as.character)

  return(cleaned_text)
}


# Clean the data
train_data_clean <- clean_text_data(train_data, badwords)
Warning in tm_map.SimpleCorpus(corpus, clean_corpus): transformation drops
documents
valid_data_clean <- clean_text_data(valid_data, badwords)
Warning in tm_map.SimpleCorpus(corpus, clean_corpus): transformation drops
documents
test_data_clean <- clean_text_data(test_data, badwords)
Warning in tm_map.SimpleCorpus(corpus, clean_corpus): transformation drops
documents
# Check cleaned data. (The "transformation drops documents" warnings above are emitted
# by tm when content transformations are applied to a SimpleCorpus; the cleaned lines
# are still returned, as the output below shows.)
head(train_data_clean)
[1] "suntrup sack four time threw one intercept"                                                                                  
[2] "love short link awesm"                                                                                                       
[3] "continu later second quarter fumbl thoma morstead yard punt giant yard line crisi avert amukamara pounc ball giant yard line"
[4] "can follow plz"                                                                                                              
[5] "nope easi prove know fact small talk prove critic thinker"                                                                   
[6] "record session may stop lovin music lilli tonight come get tast"                                                             
head(valid_data_clean)
[1] "even know ugli life easier"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
[2] "back englemann talk masteri spiral frame classroom environ say danger teacher focus student master part know subject lipe book spoke statist say heard mom long ago usa parent concern teacher care student china concern teacher know master teach return englemann paper read critic school teacher give student test result poor alway put blame student tri test learn disord claim student learn taught sharpli point fact find provoc true student fail teacher fail talk teacher present lesson sweden state easi teach masteri teacher just wast opportun deliv great lesson ask question"
[3] "hanley"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
[4] "good morn shopper merchant breakfast"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             
[5] "klinenberg touch deep intang emot issu share regardless live find partner write enough solv social pain loneli fundament part human experi also note live alon choic circumst poor ill mani case elder solitud can crush pain burden bring none posit aspect singl life"                                                                                                                                                                                                                                                                                                                          
[6] "find beepur check walmart local honey"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
head(test_data_clean)
[1] "moon"                                                                                                                                                                          
[2] "serious high yearbook"                                                                                                                                                         
[3] "interest thing time complet contradict fact live pari made individu differ anyon els continu argentina buy shirt shoe trouser everybodi els pari flea market buy dress one els"
[4] "man never get old"                                                                                                                                                             
[5] "will now ship finland will open"                                                                                                                                               
[6] "hold back think"                                                                                                                                                               

Tokenization

We tokenize all three sets and build unigram, bigram, trigram, and four-gram document-feature matrices in preparation for the n-gram models:

# Load required packages
p_load(quanteda)
p_load(quanteda.textstats)

generate_ngrams <- function(text_data, N) {
  # Tokenize the cleaned character vector
  toks <- tokens(text_data)
  # Build N-grams joined by a space (no skip-grams)
  ngrams <- tokens_ngrams(toks, n = N, concatenator = " ", skip = 0)
  # Keep only n-grams that do not contain a repeated word
  # (negative lookahead with a backreference)
  ngrams <- tokens_select(ngrams,
                          pattern = "^(?!.*(\\b\\w+\\b).*\\1).*$",
                          valuetype = "regex", case_insensitive = TRUE)
  # Return a document-feature matrix of n-gram counts
  dfm(ngrams)
}

# Generate unigrams, bigrams, and trigrams for each dataset
train_unigrams <- generate_ngrams(train_data_clean, 1)
train_bigrams <- generate_ngrams(train_data_clean, 2)
train_trigrams <- generate_ngrams(train_data_clean, 3)
train_fourgrams <- generate_ngrams(train_data_clean, 4)

valid_unigrams <- generate_ngrams(valid_data_clean, 1)
valid_bigrams <- generate_ngrams(valid_data_clean, 2)
valid_trigrams <- generate_ngrams(valid_data_clean, 3)
valid_fourgrams <- generate_ngrams(valid_data_clean, 4)

test_unigrams <- generate_ngrams(test_data_clean, 1)
test_bigrams <- generate_ngrams(test_data_clean, 2)
test_trigrams <- generate_ngrams(test_data_clean, 3)
test_fourgrams <- generate_ngrams(test_data_clean, 4)
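
A rough size check on the resulting document-feature matrices (illustrative; nfeat() is quanteda's feature count, and the exact numbers depend on the sampling seed):

# Number of distinct n-grams retained in each training matrix
sapply(list(unigrams  = train_unigrams,
            bigrams   = train_bigrams,
            trigrams  = train_trigrams,
            fourgrams = train_fourgrams), nfeat)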

Exploratory Data Analysis

Let’s take a look at the top 10 features for each n-gram order (unigrams through four-grams) in the training set:

# Function to count the top n most common n-grams
p_load(quanteda.textstats)

count_top_ngrams <- function(tokens_df, top_n = 10) {
  tokens_df %>%
    textstat_frequency() %>%
    head(top_n)
}

# Display the top 10 most common unigrams, bigrams, trigrams, and 4-grams for training data
top_train_unigrams <- count_top_ngrams(train_unigrams, top_n = 10)
top_train_bigrams <- count_top_ngrams(train_bigrams, top_n = 10)
top_train_trigrams <- count_top_ngrams(train_trigrams, top_n = 10)
top_train_fourgrams <- count_top_ngrams(train_fourgrams, top_n = 10)

# Print the top results
list(
  Top_Train_Unigrams = top_train_unigrams,
  Top_Train_Bigrams = top_train_bigrams,
  Top_Train_Trigrams = top_train_trigrams,
  Top_Train_Fourgrams = top_train_fourgrams
)
$Top_Train_Unigrams
   feature frequency rank docfreq group
1     will     25766    1   21281   all
2      one     25370    2   22169   all
3      get     24359    3   22020   all
4     like     24234    4   21590   all
5     said     24183    5   22007   all
6     just     24123    6   22337   all
7     time     21453    7   19069   all
8      can     20826    8   18325   all
9     year     19354    9   16691   all
10     day     18357   10   16326   all

$Top_Train_Bigrams
        feature frequency rank docfreq group
1     right now      1976    1    1935   all
2      year old      1975    2    1851   all
3     last year      1874    3    1814   all
4     look like      1673    4    1637   all
5      new york      1662    5    1547   all
6     feel like      1392    6    1341   all
7      year ago      1370    7    1345   all
8    last night      1336    8    1311   all
9  look forward      1235    9    1224   all
10  high school      1230   10    1118   all

$Top_Train_Trigrams
               feature frequency rank docfreq group
1     happi mother day       271    1     268   all
2        new york citi       202    2     193   all
3       happi new year       176    3     176   all
4  presid barack obama       165    4     165   all
5     look forward see       133    5     132   all
6         two year ago       117    6     117   all
7        new york time       114    7     114   all
8         new year eve        75    8      70   all
9      dream come true        72    9      72   all
10       five year ago        69   10      68   all

$Top_Train_Fourgrams
                               feature frequency rank docfreq group
1                 happi mother day mom        32    1      32   all
2            thank follow look forward        31    2      31   all
3             dow jone industri averag        27    3      27   all
4               new york stock exchang        27    3      26   all
5                  rock roll hall fame        24    5      23   all
6               happi new year everyon        24    5      24   all
7       calori protein carbohydr satur        22    7      22   all
8  protein carbohydr satur cholesterol        22    7      22   all
9   carbohydr satur cholesterol sodium        22    7      22   all
10      satur cholesterol sodium fiber        22    7      22   all
# Visualize the top n-grams

# Function to plot n-grams
plot_ngrams <- function(data, title) {
  data <- data %>%
    arrange(desc(frequency))  # Order data from most frequent to least frequent
  
  ggplot(data, aes(x = reorder(feature, frequency), y = frequency)) +
    geom_bar(stat = "identity", fill = "steelblue") +
    theme_minimal() +
    labs(title = title, x = "N-Gram", y = "Frequency") +
    coord_flip()  # This makes it easier to read the n-grams
}

# Plotting each set of n-grams
p1 <- plot_ngrams(top_train_unigrams, "Top 10 Training Unigrams")
p2 <- plot_ngrams(top_train_bigrams, "Top 10 Training Bigrams")
p3 <- plot_ngrams(top_train_trigrams, "Top 10 Training Trigrams")
p4 <- plot_ngrams(top_train_fourgrams, "Top 10 Training Fourgrams")

# Print the plots
gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 2)
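
Finally, to connect these frequency tables back to the prediction task in the title, below is a minimal "stupid backoff"-style sketch of a next-word lookup built on the bigram and trigram counts. This is illustrative only: predict_next_word() and the 0.4 backoff weight are assumptions made for this sketch, not the project's final model, and the prefix is assumed to have already been cleaned the same way as the training data.

# Frequency tables for the lookup (textstat_frequency() returns feature/frequency columns)
bigram_freq  <- textstat_frequency(train_bigrams)
trigram_freq <- textstat_frequency(train_trigrams)

predict_next_word <- function(prefix, top_n = 3) {
  prefix <- str_squish(tolower(prefix))
  # Trigram candidates: features whose first two words match the two-word prefix
  hits <- trigram_freq %>%
    filter(str_starts(feature, fixed(paste0(prefix, " ")))) %>%
    mutate(prediction = word(feature, -1), score = frequency)
  # Back off to bigrams keyed on the last word of the prefix, with a 0.4 penalty
  if (nrow(hits) == 0) {
    hits <- bigram_freq %>%
      filter(str_starts(feature, fixed(paste0(word(prefix, -1), " ")))) %>%
      mutate(prediction = word(feature, -1), score = 0.4 * frequency)
  }
  hits %>% arrange(desc(score)) %>% slice_head(n = top_n) %>% pull(prediction)
}

# Example lookup; the returned words depend on the sampled training data
predict_next_word("new york")

A full model would add smoothing (for example Kneser-Ney or simple interpolation) and handle unseen prefixes more gracefully; that is left for the modeling stage.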