Summary

In this milestone report, I demonstrated the initial steps of building a text prediction algorithm using the SwiftKey dataset. I successfully loaded and cleaned the text data, created unigram, bigram, and trigram models, and calculated word frequencies and probabilities.

Load necessary libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidytext)
library(tokenizers)
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(ggplot2)

Training and Testing data

twitter_file <- file.path("data", "final", "en_US", "en_US.twitter.txt")
#load data using readLines
twitter <- readLines(twitter_file, encoding = "UTF-8", skipNul = TRUE)
head(twitter)
## [1] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long."  
## [2] "When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason."
## [3] "they've decided its more fun if I don't."                                                                       
## [4] "So Tired D; Played Lazer Tag & Ran A LOT D; Ughh Going To Sleep Like In 5 Minutes ;)"                           
## [5] "Words from a complete stranger! Made my birthday even better :)"                                                
## [6] "First Cubs game ever! Wrigley field is gorgeous. This is perfect. Go Cubs Go!"
set.seed(123)
samp <- sample(twitter, size = 100)
# Get the total number of elements
N <- length(samp)
#Figure out how many go to train
n_train <- floor(0.8 * N)
# Sample indices for the training set
train_80 <- sample(1:N, n_train)
# use indices to split training and testing set
train_set <- samp[train_80]
testing_set <- samp[-train_80]
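The 100-line sample above keeps this milestone quick to knit, but the same load-and-split approach scales to a larger slice of the corpus. Below is a minimal sketch of that idea; the sample_lines helper, the 1% sampling fraction, and the en_US.blogs.txt / en_US.news.txt file paths are illustrative assumptions rather than part of the analysis above.

# hypothetical helper: read a file and keep a random fraction of its lines
sample_lines <- function(path, frac = 0.01){
  lines <- readLines(path, encoding = "UTF-8", skipNul = TRUE)
  sample(lines, size = floor(frac * length(lines)))
}
# assumed paths to the other two SwiftKey sources
blogs_file <- file.path("data", "final", "en_US", "en_US.blogs.txt")
news_file  <- file.path("data", "final", "en_US", "en_US.news.txt")
# combine a 1% sample of each source, then reuse the same 80/20 split as above
corpus_samp <- c(sample_lines(twitter_file), sample_lines(blogs_file), sample_lines(news_file))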

Using the tokenizer function

For the Unigrams

token <- tokenize_words(train_set, lowercase = FALSE, strip_punct = TRUE)
head(token,1)
## [[1]]
##  [1] "Imagine"   "Liam"      "walking"   "in"        "on"        "Larry"    
##  [7] "sex"       "and"       "giving"    "the"       "boys"      "time"     
## [13] "outs"      "Lol"       "oh"        "daddy"     "direction"
# flatten into one big character vector
all_words <- unlist(token)
# turn into a data frame
df_words <- data.frame(word = all_words)
# count frequencies
count_token <- count(df_words, word, sort = TRUE)
# Total number of tokens
total_tokens <- sum(count_token$n)
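With the unigram counts in hand, one optional sanity check (not part of the original analysis) is word coverage: how many unique words are needed to account for a given share of all tokens. The 50% and 90% thresholds below are arbitrary choices for illustration.

# cumulative share of all tokens covered by the most frequent words
# (count_token is already sorted by descending frequency)
coverage <- cumsum(count_token$n) / total_tokens
# number of unique words needed to cover 50% and 90% of the sampled tokens
which(coverage >= 0.5)[1]
which(coverage >= 0.9)[1]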

Using the tokenizer function

For the Bigrams

# tokenize into bigrams and flatten into one character vector
all_words2 <- unlist(tokenize_ngrams(train_set, 
                          lowercase = FALSE,
                          ngram_delim = " ",
                          n = 2,
                          n_min = 2))
head(all_words2, 1)
## [1] "Imagine Liam"
# Create a tibble with a column named ngram
bi <- tibble(ngram = as.character(all_words2))

bi <- bi %>%
  separate(ngram, into = c("history", "next"), sep = " ") %>%
  count(history, `next`, sort = TRUE, name = "count") %>%
  group_by(history) %>%
  mutate(history_total = sum(count)) %>%
  ungroup()
  
# optional pruning / top-k per history
min_cnt <- 2
k <- 10
bi <- bi %>%
  filter(count >= min_cnt)%>%
  group_by(history)%>%
  slice_max(order_by = count, n = k, with_ties = FALSE)%>%
  ungroup()
# quick check
# how many rows remain after pruning?
n_after <- nrow(bi)
# which histories exist?
bi %>% count(history, sort = TRUE) %>% head(20)
## # A tibble: 17 × 2
##    history     n
##    <chr>   <int>
##  1 I           6
##  2 if          2
##  3 it          2
##  4 to          2
##  5 you         2
##  6 a           1
##  7 and         1
##  8 at          1
##  9 for         1
## 10 going       1
## 11 gonna       1
## 12 in          1
## 13 on          1
## 14 so          1
## 15 thanks      1
## 16 they        1
## 17 when        1
# do any forms of "i" exist?
bi %>%
  filter(history %in% c( "i", "I")) %>% arrange(desc(count))%>%
  head(10)
## # A tibble: 6 × 4
##   history `next` count history_total
##   <chr>   <chr>  <int>         <int>
## 1 I       was        3            36
## 2 I       am         2            36
## 3 I       can        2            36
## 4 I       know       2            36
## 5 I       tell       2            36
## 6 I       think      2            36

Using the tokenizer function

For the Trigrams

all_words3 <- unlist(tokenize_ngrams(train_set,
                                     lowercase = FALSE,
                                     ngram_delim = " ",
                                     n = 3,
                                     n_min = 3))
# Create a tibble with a column named ngram for the trigrams
tri <- tibble(ngram = as.character(all_words3))

tri <- tri %>%
  separate(ngram, into = c("w1", "w2", "w3"), sep = " ") %>%
  mutate(history = paste(w1, w2), next_token = w3) %>%
  count(history, next_token, sort = TRUE, name = "count") %>%
  group_by(history) %>%
  mutate(history_total = sum(count)) %>%
  ungroup()
# drop rows with NA values (incomplete trigrams from short lines)
tri <- tri %>%
  drop_na(history, next_token)

nrow(tri)
## [1] 870
min_cnt2 <- 1
k2 <- 10

tri <- tri %>%
  filter(count >= min_cnt2) %>%
  group_by(history) %>%
  slice_max(order_by = count, n = k2, with_ties = FALSE) %>%
  ungroup()

# how many rows remain after pruning?
n_after2 <- nrow(tri)
# which histories exist?
tri %>% count(history, sort = TRUE) %>% head(20)
## # A tibble: 20 × 2
##    history       n
##    <chr>     <int>
##  1 for the       6
##  2 going to      4
##  3 I was         3
##  4 and I         3
##  5 at the        3
##  6 I am          2
##  7 I can         2
##  8 I know        2
##  9 I tell        2
## 10 I think       2
## 11 a good        2
## 12 gonna be      2
## 13 if I          2
## 14 if they       2
## 15 in the        2
## 16 it is         2
## 17 on a          2
## 18 so much       2
## 19 they were     2
## 20 to be         2
# do any of these histories exist?
tri %>%
  filter(history %in% c("i am", "the", "thanks for")) %>%
  arrange(desc(count)) %>%
  head(10)
## # A tibble: 1 × 4
##   history    next_token count history_total
##   <chr>      <chr>      <int>         <int>
## 1 thanks for the            2             2

Adding probabilities

# take raw word counts (count_token) and calculate the probability of each word
uni1 <- count_token %>%
  mutate(prob = n / sum(n))   # prob = word count / total word count
# check that probabilities sum to 1
sum(uni1$prob)
## [1] 1
#count of bigram / total count for that history
bi <- bi %>%
  mutate(prob = count/history_total)
# Sum of probabilities across all histories (each history's probabilities sum to at most 1 after pruning)
sum(bi$prob)
## [1] 6.576134
# Count how many distinct "history" contexts exist
n_distinct(bi$history)
## [1] 17
# per history, sum the retained probability mass and show the 10 histories with the most mass
bi %>% group_by(history) %>%
  summarise(total_prob = sum(prob)) %>%
  arrange(desc(total_prob)) %>% head(10)
## # A tibble: 10 × 2
##    history total_prob
##    <chr>        <dbl>
##  1 going        0.8  
##  2 if           0.8  
##  3 thanks       0.667
##  4 when         0.667
##  5 for          0.5  
##  6 gonna        0.5  
##  7 it           0.364
##  8 I            0.361
##  9 so           0.333
## 10 at           0.3
# trigram count / total count for its 2-word history
tri <- tri %>%
  mutate(prob = count / history_total)
# Total probability mass; nothing was pruned here (min_cnt2 = 1), so this equals the number of distinct histories
sum(tri$prob)
## [1] 838
# Number of distinct 2-word histories in the trigram table
n_distinct(tri$history)
## [1] 838
# Preview trigram table
tri
## # A tibble: 870 × 5
##    history       next_token    count history_total  prob
##    <chr>         <chr>         <int>         <int> <dbl>
##  1 14 7          with              1             1     1
##  2 18 I          can               1             1     1
##  3 2012 home     game              1             1     1
##  4 23 left       in                1             1     1
##  5 3 seperate    conversations     1             1     1
##  6 5 23          left              1             1     1
##  7 7 More        days              1             1     1
##  8 7 with        5                 1             1     1
##  9 830 it's      gonna             1             1     1
## 10 APPROVES bill that              1             1     1
## # ℹ 860 more rows
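As a concrete check on the maximum-likelihood estimate used above: the history "thanks" begins 3 bigrams in the training sample, 2 of which are "thanks for", so prob = count / history_total = 2 / 3 ≈ 0.667. The corresponding row can be pulled from the bigram table (this lookup is just an illustration):

bi %>% filter(history == "thanks")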

Top 20 unigrams and bigrams

top_uni <- uni1 %>%
  arrange(desc(prob)) %>%
  head(20)
  
ggplot(top_uni, aes(x = reorder(word, -prob), y = prob)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(title = "Top 20 Unigrams by Probability", x = "Words", y = "Probability") +
  theme_minimal()

top_bi <- bi %>%
  arrange(desc(prob)) %>%
  head(20)

ggplot(top_bi, aes(x = reorder(paste(history, `next`), -prob), y = prob)) +
  geom_bar(stat = "identity", fill = "red") +
  labs(title = "Top 20 Bigrams by Probability", x = "Bigram", y = "Probability") +
  theme_minimal()

Build a backoff predictor

# Example input text we want to test
input_text <- "thanks for"
# Define a function to predict the next word using trigram probabilities
predict_trigram <- function(input_text, tri, k2 =3){
  # Split input text into lowercase tokens (words), splitting on whitespace.
  # Note: the n-gram tables were tokenized with lowercase = FALSE, so
  # capitalized histories such as "I was" will not match lowercased input.
  tokens <- unlist(strsplit(tolower(input_text), "\\s+"))
  # Count how many tokens are in the input
  n <- length(tokens)
  # Only proceed if we have at least 2 tokens (needed for trigram history)
  if(n>= 2){
    # Get the last two words from the input as the "history"
    w1 <- tokens[n-1]
    w2 <- tokens[n]
    history2 <- paste(w1,w2)
   # Filter trigram table to rows that match this history
    tri_hits <- tri %>%
     filter(history == history2) %>%  # match the two-word history from the input
     arrange(desc(prob))%>% # Sort candidates by probability
     head(k2)   # Take the top k2 predictions
    
  # If results found, return them
   if(nrow(tri_hits)>0){
     return(tri_hits)
   } else{
     # If no trigram predictions, return a message
      return(data.frame(message = "No trigram found"))
   }
 } else{
   # If the input text is too short, return a message
    return(data.frame(message = "Not enough words for trigram"))
  }
}

predict_trigram("thanks for", tri, k2 = 3)
## # A tibble: 1 × 5
##   history    next_token count history_total  prob
##   <chr>      <chr>      <int>         <int> <dbl>
## 1 thanks for the            2             2     1
# Example input text for bigram
input_text2 <- "thanks going I"
# Define a function to predict the next word using bigram probabilities
predict_bigram <- function(input_text2, bi, k3 = 3){
  # Tokenize and lowercase input text
  token2 <- unlist(strsplit(tolower(trimws(input_text2)), "\\s+"))
  # Count tokens
  n2 <-length(token2)
  # Only proceed if we have at least 1 token (needed for bigram history)
  if(n2 >= 1){
    h1 <- token2[n2] # Last word of input as history
    # Filter bigram table for rows matching that last word
    bi_hits <- bi %>%
      filter(history == h1)%>%
      arrange(desc(prob)) %>%  # Sort by probability
      head(k3)                # Take the top k3 predictions
    
    # If we found results, return them
    if(nrow(bi_hits) > 0){
      return(bi_hits)
      
    } else {
      # If no matches, return a message
      return(data.frame(message = "No biagram found"))
    }
  } else{
    # If the input text is empty, return a message
    return(data.frame(message = "Not enough words for biagram"))
  }
}
# get the top bigram predictions for a couple of histories
predict_bigram("thanks", bi, k3 = 3)
## # A tibble: 1 × 5
##   history `next` count history_total  prob
##   <chr>   <chr>  <int>         <int> <dbl>
## 1 thanks  for        2             3 0.667
predicted_biagram("going", bi, k3 = 3)
## # A tibble: 1 × 5
##   history `next` count history_total  prob
##   <chr>   <chr>  <int>         <int> <dbl>
## 1 going   to         4             5   0.8
# Function to return the top k4 unigrams by probability
predicted_unigram <- function(uni1, k4 = 5){
  uni1 %>%
    slice_max(prob, n = k4, with_ties = FALSE) %>%
    arrange(desc(prob)) %>%                # sort by descending probability
    transmute(next_word = word, prob) %>%  # rename the word column to next_word, keep prob
    head(k4)                               # return the top k4
}
# get top 5 unigrams
predicted_unigram(uni1, k4 = 5)
##   next_word       prob
## 1         I 0.03491756
## 2       the 0.02521823
## 3        to 0.02230844
## 4         a 0.02036857
## 5       you 0.01842871
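So far each n-gram predictor is called on its own. A minimal sketch of how they could be chained into the backoff predictor planned for the next stage is shown below; the predict_backoff name and the simple "message column" check are illustrative assumptions, and no smoothing or discounting is applied yet.

# hypothetical wrapper: try the trigram table first, then the bigram table,
# then fall back to the most frequent unigrams
predict_backoff <- function(input_text, tri, bi, uni1, k = 3){
  res <- predict_trigram(input_text, tri, k2 = k)
  # the helpers above return a data frame with a "message" column when nothing matches
  if(!"message" %in% names(res)){
    return(transmute(res, next_word = next_token, prob))
  }
  res <- predict_bigram(input_text, bi, k3 = k)
  if(!"message" %in% names(res)){
    return(transmute(res, next_word = `next`, prob))
  }
  # last resort: the most frequent unigrams
  predicted_unigram(uni1, k4 = k)
}
# served by the trigram table
predict_backoff("thanks for", tri, bi, uni1)
# a history the sample has almost certainly never seen falls back to shorter contexts
predict_backoff("qwerty zzz", tri, bi, uni1)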

Results

From the exploratory analysis, I created frequency tables and probability estimates for unigrams, bigrams, and trigrams.

Conclusion

In this milestone, I demonstrated that the data was successfully loaded, cleaned, and explored through the creation of unigram, bigram, and trigram models. The analysis highlighted the most frequent words and phrases, confirming that the dataset is appropriate for building a predictive text model. The unigram bar plot shows that the distribution is heavily skewed toward common stopwords, reflecting natural patterns of written English, while the bigram plot emphasizes the frequent two-word combinations that are essential for context-aware predictions. These results establish the foundation for the next stage, where I will implement a backoff-based prediction algorithm and eventually deploy it in a Shiny application. The progress so far shows that the project is on track and that the exploratory findings will directly guide the design of the final predictive system.
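As a forward-looking illustration of the deployment step mentioned above, the sketch below shows one way the backoff predictor could sit behind a Shiny interface, assuming the n-gram tables and prediction functions from this report (including the predict_backoff sketch) are available in the app's environment; the widget names and layout are placeholders, not a final design.

library(shiny)

ui <- fluidPage(
  titlePanel("Next-word prediction (sketch)"),
  textInput("phrase", "Type a phrase:", value = "thanks for"),
  tableOutput("predictions")
)

server <- function(input, output, session){
  # recompute the prediction table whenever the input phrase changes
  output$predictions <- renderTable({
    predict_backoff(input$phrase, tri, bi, uni1)
  })
}

shinyApp(ui, server)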