1 Import Data

Install and load required packages:

# Load Packages
suppressPackageStartupMessages(library(pacman))
p_load(tm, tidyverse, rebus, stringr, wordcloud, class, qdap, readtext, quanteda, stringi, filehash, RWeka, RColorBrewer)
# devtools::install_github("quanteda/quanteda.corpora")
# devtools::install_github("quanteda/quanteda.dictionaries")

Download and import text files:

url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# Download the archive if it is missing, then extract it
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(url = url, destfile = "Coursera-SwiftKey.zip", quiet = TRUE)
}
if (!dir.exists("final/en_US")) {
  unzip(zipfile = "Coursera-SwiftKey.zip")
}
rm(url)
# file.remove("Coursera-SwiftKey.zip") # delete file

# Import English text files
twitter_en <- readLines(con = "final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE) # skipNul = TRUE skips embedded nul characters
news_en <- readLines(con = "final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
blogs_en <- readLines(con = "final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)

Calculate and print summary statistics for each file:

Files <- dir(path = "final/en_US/", full.names=TRUE)
Summary_Statistics <- data.frame(
           File = basename(Files), 
           `File Size (MB)` = round(file.size(Files)/(1024*1024), 1), # File size (in MB)
           # Keep the same order as dir(): blogs, news, twitter
           Documents = sapply(X = list(blogs_en, news_en, twitter_en), FUN = length), # Number of documents (lines)
           # Words = sapply(X = list(blogs_en, news_en, twitter_en), FUN = function(x) sum(word_count(x), na.rm = TRUE)), # Number of words
           stringsAsFactors = FALSE
           )

colnames(Summary_Statistics) <- c("File", "File Size (MB)", "Documents")
# Print Statistics
Summary_Statistics
# Remove obsolete objects
rm(Summary_Statistics)
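
The word count column is commented out above; if needed, an approximate count can be obtained with stri_count_words from the already loaded stringi package (an alternative to qdap::word_count, assumed here rather than taken from the original analysis):

# Approximate word counts per file, in the same order as dir(): blogs, news, twitter
Words <- sapply(X = list(blogs_en, news_en, twitter_en),
                FUN = function(x) sum(stringi::stri_count_words(x), na.rm = TRUE))
Words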

Remove handles, hashtags and retweet (RT) markers from the Twitter dataset:

# Clean Twitter Data
# Remove Twitter handles (keep the captured preceding character)
twitter_en <- gsub(pattern = "(^|[^@\\w])@(\\w{1,15})\\b", replacement = "\\1", x = twitter_en) 
# Remove hashtags
twitter_en <- gsub(pattern = "#\\S+", replacement = "", x = twitter_en) 
# Remove RT references
twitter_en <- gsub(pattern = "^RT", replacement = " ", x = twitter_en) 

Merge all datasets together:

# Merge all sources in one object
en_data <- c(twitter_en, news_en, blogs_en)
save(en_data, file = "en_data.RData")

# Remove obsolete objects
rm(twitter_en, news_en, blogs_en)

2 Create training, test and validation sets

Due to computational constraints, we can only use a small fraction of the available documents.

We allocate 50% of the sampled text to the training set and 25% each to the test and validation sets.

set.seed(1)
# Reduce size of data
# load(file = "en_data.RData")
fraction <- 0.005
en_data_abridged <- sample(x = en_data, size = floor(length(en_data) * fraction), replace = FALSE) 

# Create index for training set data
training_index <- sample(x = 1:length(en_data_abridged), size = floor(length(en_data_abridged) * 0.50), replace = FALSE)

# Create Training Set (50% of documents)
training_en <- en_data_abridged[training_index]

# Split testing data into testing and validation set
testing_en <- en_data_abridged[-training_index]

# Create index for test set data
testing_index <- sample(x = 1:length(testing_en), size = floor(length(testing_en) * 0.50), replace = FALSE)

# Create Validation Set (50/2 = 25% of documents)
validation_en <- testing_en[-testing_index]

# Create Test Set (50/2 = 25% of documents)
testing_en <- testing_en[testing_index]

# head(training_en)
# head(validation_en)
# head(testing_en)
# length(training_en)
# length(validation_en)
# length(testing_en)

# length(validation_en) + length(testing_en) + length(training_en) == length(en_data_abridged)

# Save files
saveRDS(object = training_en, file = "training_en.rds")
saveRDS(testing_en, file = "testing_en.rds")
saveRDS(validation_en, file = "validation_en.rds")

# Remove obsolete objects
rm(validation_en, testing_en, training_en, training_index, testing_index, fraction, en_data, en_data_abridged)

3 Cleaning and Processing of Text Data

3.1 Create and Clean Corpus

Create a corpus from the merged English training text:

training_en <- readRDS(file = "training_en.rds")

# Convert to ASCII; lines with non-convertible characters become NA and are dropped below
training_en <- iconv(training_en, from = "UTF-8", to = "ASCII")
# Replace remaining non-graphical characters (e.g. control characters) with spaces
training_en <- str_replace_all(training_en, "[^[:graph:]]", " ") 

# head(training_en, 25)
# length(training_en)

# Remove NA values
training_en <- training_en[!is.na(training_en)]

# Create volatile corpus
training_corpus <- training_en %>% 
  VectorSource() %>% 
  VCorpus()

# class(training_corpus)

The corpus can now be cleaned with the tm_map function. We will use it to:

* Remove extra white space
* Remove punctuation
* Remove numbers
* Remove profanity
* Lowercase all terms

# Import list of profanity terms
profanity <- readLines("Profanity.txt") # Source: https://www.freewebheaders.com/download/files/base-list-of-bad-words_text-file_2018_07_30.zip

# Create function to clean corpus
clean_corpus <- function(corpus){
  corpus <- tm_map(corpus, stripWhitespace) # remove extra white space
  # corpus <- tm_map(corpus, content_transformer(replace_abbreviation)) # replace abbreviations
  corpus <- tm_map(corpus, removePunctuation, preserve_intra_word_contractions = TRUE, preserve_intra_word_dashes = TRUE) # remove punctuation, keep contractions and intra-word dashes
  corpus <- tm_map(corpus, removeNumbers) # remove numbers
  corpus <- tm_map(corpus, content_transformer(tolower)) # lowercase terms before matching the (lowercase) profanity list
  corpus <- tm_map(corpus, removeWords, c(profanity, "rt")) # remove profanity and retweet markers
  # corpus <- tm_map(corpus, function(x) iconv(enc2utf8(x), sub = "byte"))
  return(corpus)
}

save(clean_corpus, file = "clean_corpus.RData")

# Process Corpus
training_corpus_clean <- clean_corpus(training_corpus)
save(training_corpus_clean, file = "training_corpus_clean.RData")

# training_corpus_clean_enc <- tm_map(training_corpus_clean, function(x) iconv(enc2utf8(x), sub = "byte"))
# training_corpus_clean_usable <- str_replace_all(training_corpus_clean,"[^[:graph:]]", " ")

# training_corpus_clean[[15]][1]
# class(training_corpus_clean)

3.2 Create Term Document Matrix (TDM)

With a processed corpus we can create a term document matrix (TDM) or a document term matrix (DTM).
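
The DTM is simply the transpose of the TDM (documents as rows, terms as columns). For reference, a minimal sketch of that variant, which is not used further in this analysis:

# Document Term Matrix: documents as rows, terms as columns
training_dtm <- DocumentTermMatrix(x = training_corpus_clean)
# inspect(training_dtm[1:5, 1:5])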

# Create Term Document Matrix
training_tdm <- TermDocumentMatrix(x = training_corpus_clean)

# Remove Sparse Terms
sparsity_value <- 0.999
training_tdm_sparse <- removeSparseTerms(x = training_tdm, sparse = sparsity_value)
training_tdm_sparse
<<TermDocumentMatrix (terms: 1537, documents: 7401)>>
Non-/sparse entries: 63610/11311727
Sparsity           : 99%
Maximal term length: 15
Weighting          : term frequency (tf)
# Save TDM
saveRDS(training_tdm_sparse, file = "training_tdm_sparse.rds")

# Remove obsolete objects
rm(training_en, training_corpus, profanity, training_tdm)
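
As a quick sanity check (not part of the original write-up), findFreqTerms from tm lists the terms whose total frequency exceeds a threshold; the cut-off of 100 below is arbitrary:

# Terms appearing at least 100 times across the training documents
findFreqTerms(training_tdm_sparse, lowfreq = 100)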

4 N-gram Tokenization

We can use the NGramTokenizer function from the RWeka package to tokenize the documents. We will build term document matrices for unigrams, bigrams, trigrams and 4-grams, and use the removeSparseTerms function to drop sparse terms. A quick sanity check of the tokenizers follows their definitions below.

# Create custom tokenizing functions for N-Grams
Tokenizer_1Gram <- function(x) {NGramTokenizer(x, Weka_control(min = 1, max = 1))}
Tokenizer_2Gram <- function(x) {NGramTokenizer(x, Weka_control(min = 2, max = 2))}
Tokenizer_3Gram <- function(x) {NGramTokenizer(x, Weka_control(min = 3, max = 3))}
Tokenizer_4Gram <- function(x) {NGramTokenizer(x, Weka_control(min = 4, max = 4))}
save(list = c("Tokenizer_1Gram", "Tokenizer_2Gram", "Tokenizer_3Gram", "Tokenizer_4Gram"), file = "Tokenizers_NGram.RData")
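
As a quick illustration (not part of the original pipeline), the tokenizers can be applied directly to a character string:

# Example: bigram tokenization of a short sentence
Tokenizer_2Gram("the quick brown fox")
# Expected output (roughly): "the quick" "quick brown" "brown fox"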

options(digits = 10)
# 1-Gram TDM
sparsity_value <- 0.9999
TDM_1Gram_sparse <- TermDocumentMatrix(training_corpus_clean, control = list(tokenize = Tokenizer_1Gram))
TDM_1Gram_sparse <- removeSparseTerms(TDM_1Gram_sparse, sparse = sparsity_value)
TDM_1Gram_sparse
<<TermDocumentMatrix (terms: 15489, documents: 7401)>>
Non-/sparse entries: 86924/114547165
Sparsity           : 100%
Maximal term length: 68
Weighting          : term frequency (tf)
save(TDM_1Gram_sparse, file = "TDM_1Gram_sparse.RData")

# 2-Gram TDM
sparsity_value <- 0.999
TDM_2Gram_sparse <- TermDocumentMatrix(training_corpus_clean, control = list(tokenize = Tokenizer_2Gram))
TDM_2Gram_sparse <- removeSparseTerms(TDM_2Gram_sparse, sparse = sparsity_value)
TDM_2Gram_sparse
<<TermDocumentMatrix (terms: 1322, documents: 7401)>>
Non-/sparse entries: 28253/9755869
Sparsity           : 100%
Maximal term length: 15
Weighting          : term frequency (tf)
save(TDM_2Gram_sparse, file = "TDM_2Gram_sparse.RData")

# 3-Gram TDM
sparsity_value <- 0.999
TDM_3Gram_sparse <- TermDocumentMatrix(training_corpus_clean, control = list(tokenize = Tokenizer_3Gram))
TDM_3Gram_sparse <- removeSparseTerms(TDM_3Gram_sparse, sparse = sparsity_value)
TDM_3Gram_sparse
<<TermDocumentMatrix (terms: 179, documents: 7401)>>
Non-/sparse entries: 2443/1322336
Sparsity           : 100%
Maximal term length: 20
Weighting          : term frequency (tf)
save(TDM_3Gram_sparse, file = "TDM_3Gram_sparse.RData")

# 4-Gram TDM
sparsity_value <- 0.9995
TDM_4Gram_sparse <- TermDocumentMatrix(training_corpus_clean, control = list(tokenize = Tokenizer_4Gram))
TDM_4Gram_sparse <- removeSparseTerms(TDM_4Gram_sparse, sparse = sparsity_value)
TDM_4Gram_sparse
<<TermDocumentMatrix (terms: 76, documents: 7401)>>
Non-/sparse entries: 461/562015
Sparsity           : 100%
Maximal term length: 21
Weighting          : term frequency (tf)
save(TDM_4Gram_sparse, file = "TDM_4Gram_sparse.RData")

# Remove obsolete objects
rm(Tokenizer_1Gram, Tokenizer_2Gram, Tokenizer_3Gram, Tokenizer_4Gram)

5 Exploratory Data Analysis

5.2 Unigram

Print a table summarising the most frequent unigrams, along with a word cloud and a bar chart of the top 50:

options(digits = 2)

TDM_1Gram_matrix <- as.matrix(TDM_1Gram_sparse)
TDM_1Gram_Values <- sort(rowSums(TDM_1Gram_matrix), decreasing = TRUE)
TDM_1Gram_DF <- data.frame(Term = names(TDM_1Gram_Values), Count = TDM_1Gram_Values, row.names = NULL, stringsAsFactors = FALSE)
TDM_1Gram_DF <- TDM_1Gram_DF %>% mutate(Frequency = Count/sum(Count))
TDM_1Gram_DF[1:25, ]
pal <- brewer.pal(n = 9,name = "Reds")[3:9]
wordcloud(words = TDM_1Gram_DF$Term, freq = TDM_1Gram_DF$Frequency, max.words = 100, colors = pal, random.order = F)

rm(TDM_1Gram_matrix, TDM_1Gram_Values)
TDM_1Gram_DF %>% 
  mutate(`Frequency %` = round(Count/sum(Count)*100, 2)) %>% 
  arrange(desc(Count)) %>% 
  slice(1:50) %>% 
  ggplot(mapping = aes(x = reorder(Term, Count), y = Count)) +
    geom_col(fill = "tan") +
    coord_flip() +
    theme_bw() +
    labs(x = "Terms", title = "Top 50 Unigrams", subtitle = "Taken from a subset of the whole dataset") +
    theme(axis.ticks = element_blank())


5.3 Bigram

TDM_2Gram_matrix <- as.matrix(TDM_2Gram_sparse)
TDM_2Gram_Values <- sort(rowSums(TDM_2Gram_matrix), decreasing = TRUE)
TDM_2Gram_DF <- data.frame(Term = names(TDM_2Gram_Values), Count = TDM_2Gram_Values, row.names = NULL, stringsAsFactors = FALSE)
TDM_2Gram_DF <- TDM_2Gram_DF %>% mutate(Frequency = Count/sum(Count))
TDM_2Gram_DF[1:25, ]
pal <- brewer.pal(n = 9,name = "Reds")[3:9]
wordcloud(words = TDM_2Gram_DF$Term, freq = TDM_2Gram_DF$Frequency, max.words = 100, colors = pal, random.order = F)

rm(TDM_2Gram_matrix, TDM_2Gram_Values)
TDM_2Gram_DF %>% 
  mutate(`Frequency %` = round(Count/sum(Count)*100, 2)) %>% 
  arrange(desc(Count)) %>% 
  slice(1:50) %>% 
  ggplot(mapping = aes(x = reorder(Term, Count), y = Count)) +
    geom_col(fill = "tan") +
    coord_flip() +
    theme_bw() +
    labs(x = "Terms", title = "Top 50 Bigrams", subtitle = "Taken from a subset of the whole dataset") +
    theme(axis.ticks = element_blank())


5.4 Trigram

TDM_3Gram_matrix <- as.matrix(TDM_3Gram_sparse)
TDM_3Gram_Values <- sort(rowSums(TDM_3Gram_matrix), decreasing = TRUE)
TDM_3Gram_DF <- data.frame(Term = names(TDM_3Gram_Values), Count = TDM_3Gram_Values, row.names = NULL, stringsAsFactors = FALSE)
TDM_3Gram_DF <- TDM_3Gram_DF %>% mutate(Frequency = Count/sum(Count))

pal <- brewer.pal(n = 9,name = "Reds")[3:9]
wordcloud(words = TDM_3Gram_DF$Term, freq = TDM_3Gram_DF$Frequency, max.words = 100, colors = pal, random.order = F)

rm(TDM_3Gram_matrix, TDM_3Gram_Values)
TDM_3Gram_DF %>% 
  arrange(desc(Count)) %>% 
  slice(1:50) %>% 
  ggplot(mapping = aes(x = reorder(Term, Count), y = Count)) +
    geom_col(fill = "tan") +
    coord_flip() +
    theme_bw() +
    labs(x = "Terms", title = "Top 50 Trigrams", subtitle = "Taken from a subset of the whole dataset") +
    theme(axis.ticks = element_blank())


5.5 4-Gram

TDM_4Gram_matrix <- as.matrix(TDM_4Gram_sparse)
TDM_4Gram_Values <- sort(rowSums(TDM_4Gram_matrix), decreasing = TRUE)
TDM_4Gram_DF <- data.frame(Term = names(TDM_4Gram_Values), Count = TDM_4Gram_Values, row.names = NULL, stringsAsFactors = FALSE)
TDM_4Gram_DF <- TDM_4Gram_DF %>% mutate(Frequency = Count/sum(Count))

pal <- brewer.pal(n = 9,name = "Reds")[3:9]
wordcloud(words = TDM_4Gram_DF$Term, freq = TDM_4Gram_DF$Frequency, max.words = 100, colors = pal, random.order = F)

rm(TDM_4Gram_matrix, TDM_4Gram_Values)
TDM_4Gram_DF %>% 
  arrange(desc(Count)) %>% 
  slice(1:50) %>% 
  ggplot(mapping = aes(x = reorder(Term, Count), y = Count)) +
    geom_col(fill = "tan") +
    coord_flip() +
    theme_bw() +
    labs(x = "Terms", title = "Top 50 4-Grams", subtitle = "Taken from a subset of the whole dataset") +
    theme(axis.ticks = element_blank())

rm(TDM_1Gram_sparse, TDM_2Gram_sparse, TDM_3Gram_sparse, TDM_4Gram_sparse)

6 Predicting Words

To get a first idea of how next-word prediction could work, we look up the most frequent n-grams that begin with a given word or phrase (anchoring the pattern on a complete leading word):

word <- "the"

TDM_2Gram_DF[grepl(pattern = paste0("^", word, " "), x = TDM_2Gram_DF$Term),]
word <- "thank "

TDM_2Gram_DF[grepl(pattern = paste0("^", word), x = TDM_2Gram_DF$Term),]
word <- "how are"

TDM_3Gram_DF[grepl(pattern = paste0("^", word), x = TDM_3Gram_DF$Term),]
word <- "you have"

TDM_3Gram_DF[grepl(pattern = paste0("^", word), x = TDM_3Gram_DF$Term),]
word <- "there is"

TDM_3Gram_DF[grepl(pattern = paste0("^", word), x = TDM_3Gram_DF$Term),]
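
These lookups can be wrapped into a simple back-off helper. The sketch below is a rough illustration rather than the final prediction model: the function name predict_next_word and the back-off order (trigrams first, then bigrams) are my assumptions.

# Look up candidate continuations: try the 3-gram table first, then fall back to the 2-gram table
predict_next_word <- function(phrase, n = 5) {
  words <- str_split(str_squish(tolower(phrase)), " ")[[1]]
  # Try trigrams that start with the last two words of the input
  if (length(words) >= 2) {
    prefix <- paste(tail(words, 2), collapse = " ")
    hits <- TDM_3Gram_DF[grepl(pattern = paste0("^", prefix, " "), x = TDM_3Gram_DF$Term), ]
    if (nrow(hits) > 0) return(head(hits, n))
  }
  # Fall back to bigrams that start with the last word of the input
  prefix <- tail(words, 1)
  head(TDM_2Gram_DF[grepl(pattern = paste0("^", prefix, " "), x = TDM_2Gram_DF$Term), ], n)
}

# predict_next_word("how are")
# predict_next_word("thank")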