Objective

The objective of this report is to establish a model that predicts the next word in a sentence. The following sentences are listed, and for each one we want to predict the next word.

library(tm)
library(dplyr)
library(ggplot2)
library(pryr)
library(stringr)
library(RWeka)
qb1<-"The guy in front of me just bought a pound of bacon, a bouquet, and a case of"
qb2<-"You're the reason why I smile everyday. Can you follow me please? It would mean the"
qb3<-"Hey sunshine, can you follow me and make me the"
qb4<-"Very early observations on the Bills game: Offense still struggling but the"
qb5<-"Go on a romantic date at the"
qb6<-"Well I'm pretty sure my granny has some old bagpipes in her garage I'll dust them off and be on my"
qb7<-"Ohhhhh #PointBreak is on tomorrow. Love that film and haven't seen it in quite some"
qb8<-"After the ice bucket challenge Louis will push his long wet hair out of his eyes with his little"
qb9<-"Be grateful for the good times and keep the faith during the"
qb10<-"If this isn't the cutest thing you've ever seen, then you must be"

qc1<-"When you breathe, I want to be the air for you. I'll be there for you, I'd live and I'd"
qc2<-"Guy at my table's wife got up to go to the bathroom and I asked about dessert and he started telling me about his"
qc3<-"I'd give anything to see arctic monkeys this"
qc4<-"Talking to your mom has the same effect as a hug and helps reduce your"
qc5<-"When you were in Holland you were like 1 inch away from me but you hadn't time to take a"
qc6<-"I'd just like all of these questions answered, a presentation of evidence, and a jury to settle the"
qc7<-"I can't deal with unsymetrical things. I can't even hold an uneven number of bags of groceries in each"
qc8<-"Every inch of you is perfect from the bottom to the"
qc9<-"I’m thankful my childhood was filled with imagination and bruises from playing"
qc10<-"I like how the same people are in almost all of Adam Sandler's"
qblist<-list(qb1,qb2,qb3,qb4,qb5,qb6,qb7,qb8,qb9,qb10)
qclist<-list(qc1,qc2,qc3,qc4,qc5,qc6,qc7,qc8,qc9,qc10)

Loading Data

I will first load the US Twitter data to build a sample model. The remaining sources (blogs and news) will then be analyzed separately.

df1 = readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8")
## Warning in readLines("./final/en_US/en_US.twitter.txt", encoding =
## "UTF-8"): line 167155 appears to contain an embedded nul
## Warning in readLines("./final/en_US/en_US.twitter.txt", encoding =
## "UTF-8"): line 268547 appears to contain an embedded nul
## Warning in readLines("./final/en_US/en_US.twitter.txt", encoding =
## "UTF-8"): line 1274086 appears to contain an embedded nul
## Warning in readLines("./final/en_US/en_US.twitter.txt", encoding =
## "UTF-8"): line 1759032 appears to contain an embedded nul
df.blogs = readLines("./final/en_US/en_US.blogs.txt", encoding = "UTF-8")
df.news = readLines("./final/en_US/en_US.news.txt", encoding = "UTF-8")
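
The embedded-nul warnings above come from a handful of malformed tweets. They can be avoided by reading the file with skipNul = TRUE, an option of readLines (a variation not used in this run):

df1 = readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)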

Analyze the size, word count, and other parameters of the sources

length(df1)
sum(sapply(gregexpr("\\W+", df1), length) + 1)

The Twitter file contains 2360148 lines and 32793399 words. The same quick check could be applied to the blogs and news sources; a minimal sketch, reusing the word-count expression above (not run in the original report):
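
# sketch: line and approximate word counts for all three sources
src.stats = data.frame(
        source = c("twitter", "blogs", "news"),
        lines = c(length(df1), length(df.blogs), length(df.news)),
        words = c(sum(sapply(gregexpr("\\W+", df1), length) + 1),
                  sum(sapply(gregexpr("\\W+", df.blogs), length) + 1),
                  sum(sapply(gregexpr("\\W+", df.news), length) + 1)))
src.stats

Since the Twitter file is too big to process whole, I will work with a sample containing 1% of its lines.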

s_df1 = base::sample(df1, length(df1)*0.01)
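
Note that sample() draws randomly, so the exact sample changes between runs. For a reproducible report one could fix the seed first (a variation not used here; 1234 is an arbitrary choice):

set.seed(1234)  # arbitrary seed, for reproducibility only
s_df1 = base::sample(df1, length(df1)*0.01)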

Calculate the maximum and mean document length (in words) in the sample file

doc_length = sapply(gregexpr("\\W+", s_df1), length)+1
mean(doc_length)
## [1] 13.92026
max(doc_length)
## [1] 37

Text Mining

The tm package can convert the vector of documents into a DocumentTermMatrix, which is convenient for analysis. Before the conversion, the documents need some preprocessing. The steps are shown here but kept commented out; the predictions below work directly on the raw text instead.

#s_Corpus = VCorpus(VectorSource(s_df1))
#s_Corpus = tm_map(s_Corpus, content_transformer(tolower))  # content_transformer keeps the corpus structure intact
#s_Corpus = tm_map(s_Corpus, removePunctuation)
#s_Corpus = tm_map(s_Corpus, removeNumbers)
#s_Corpus = tm_map(s_Corpus, removeWords, stopwords("english"))
#s_Corpus = tm_map(s_Corpus, stemDocument)
#s_Corpus = tm_map(s_Corpus, stripWhitespace)
#uni.s.DTM = DocumentTermMatrix(s_Corpus)
#bi_tokenizer = function (x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
#tri_tokenizer = function (x) unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
#qua_tokenizer = function (x) unlist(lapply(ngrams(words(x), 4), paste, collapse = " "), use.names = FALSE)
#bi.s.DTM = DocumentTermMatrix(s_Corpus, control = list(tokenize = bi_tokenizer))
#tri.s.DTM = DocumentTermMatrix(s_Corpus, control = list(tokenize = tri_tokenizer))
#qua.s.DTM = DocumentTermMatrix(s_Corpus, control = list(tokenize = qua_tokenizer))
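
If the DocumentTermMatrix route were followed, the most frequent unigrams could be inspected directly. A minimal sketch, assuming the commented block above has been run so that uni.s.DTM exists:

# sketch: terms appearing at least 50 times in the sample DTM (tm's findFreqTerms)
freq.terms = findFreqTerms(uni.s.DTM, lowfreq = 50)
head(freq.terms)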

To make a prediction, we first extract the last n-gram (the last n words) from the sentence. This n-gram is the observed context for the prediction.

# extract the last n words of a sentence and paste them back into a single string
ngram.extract = function(str, n) {
        sep.chr = tail(strsplit(str, split = " ")[[1]], n)
        return(paste(sep.chr, collapse = " "))
}
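
For example, the last 3-gram of the first quiz sentence:

ngram.extract(qb1, 3)  # returns "a case of"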

Prediction

First, check what the next word after “a case of” is in the Twitter source.

#x = grep("a case of", s_df1, value = TRUE)
#x
match = regmatches(df1, regexpr("a case of (.*?) ", df1))
match
##   [1] "a case of Red "                   
##   [2] "a case of the "                   
##   [3] "a case of water "                 
##   [4] "a case of the "                   
##   [5] "a case of water "                 
##   [6] "a case of plagiarism "            
##   [7] "a case of the "                   
##   [8] "a case of Shameka "               
##   [9] "a case of the "                   
##  [10] "a case of carpal "                
##  [11] "a case of whiplash "              
##  [12] "a case of Anchor "                
##  [13] "a case of keystone "              
##  [14] "a case of beer, "                 
##  [15] "a case of the "                   
##  [16] "a case of rum "                   
##  [17] "a case of the "                   
##  [18] "a case of the "                   
##  [19] "a case of the "                   
##  [20] "a case of SPRING "                
##  [21] "a case of Miller "                
##  [22] "a case of sweet "                 
##  [23] "a case of the "                   
##  [24] "a case of #SheThinksShesHot...My "
##  [25] "a case of the "                   
##  [26] "a case of the "                   
##  [27] "a case of \"parents "             
##  [28] "a case of mind "                  
##  [29] "a case of greatness "             
##  [30] "a case of beer "                  
##  [31] "a case of from "                  
##  [32] "a case of \"The "                 
##  [33] "a case of IC "                    
##  [34] "a case of beer "                  
##  [35] "a case of the "                   
##  [36] "a case of the "                   
##  [37] "a case of the "                   
##  [38] "a case of the "                   
##  [39] "a case of the "                   
##  [40] "a case of extremes, "             
##  [41] "a case of 2006 "                  
##  [42] "a case of that "                  
##  [43] "a case of Renee "                 
##  [44] "a case of the "                   
##  [45] "a case of the "                   
##  [46] "a case of Thermoplastic "         
##  [47] "a case of the "                   
##  [48] "a case of Mondays "               
##  [49] "a case of the "                   
##  [50] "a case of Nugget "                
##  [51] "a case of lead "                  
##  [52] "a case of the "                   
##  [53] "a case of food "                  
##  [54] "a case of the "                   
##  [55] "a case of the "                   
##  [56] "a case of the "                   
##  [57] "a case of the "                   
##  [58] "a case of excusable "             
##  [59] "a case of bud "                   
##  [60] "a case of Pepsi, "                
##  [61] "a case of Miller "                
##  [62] "a case of the "                   
##  [63] "a case of jet "                   
##  [64] "a case of the "                   
##  [65] "a case of the "                   
##  [66] "a case of the "                   
##  [67] "a case of the "                   
##  [68] "a case of books. "                
##  [69] "a case of sexual "                
##  [70] "a case of the "                   
##  [71] "a case of \"too "                 
##  [72] "a case of mind "                  
##  [73] "a case of the "                   
##  [74] "a case of the "                   
##  [75] "a case of \"Arrested "            
##  [76] "a case of the "                   
##  [77] "a case of psychic "               
##  [78] "a case of beer "                  
##  [79] "a case of the "                   
##  [80] "a case of C+Swiss "               
##  [81] "a case of CDs "                   
##  [82] "a case of Miller "                
##  [83] "a case of the "                   
##  [84] "a case of the "                   
##  [85] "a case of \"The "                 
##  [86] "a case of poor "                  
##  [87] "a case of Franks "                
##  [88] "a case of Gin! "                  
##  [89] "a case of jealousy "              
##  [90] "a case of mistaken "              
##  [91] "a case of the "                   
##  [92] "a case of Boxer. "                
##  [93] "a case of idle "                  
##  [94] "a case of the "                   
##  [95] "a case of an "                    
##  [96] "a case of do "                    
##  [97] "a case of spring "                
##  [98] "a case of beer. "                 
##  [99] "a case of the "                   
## [100] "a case of this "                  
## [101] "a case of Surge "                 
## [102] "a case of this "                  
## [103] "a case of writer’s "              
## [104] "a case of the "                   
## [105] "a case of beer. "                 
## [106] "a case of beer "                  
## [107] "a case of the "                   
## [108] "a case of ProPenn "               
## [109] "a case of duct "                  
## [110] "a case of the "                   
## [111] "a case of damaged "               
## [112] "a case of disgusting "            
## [113] "a case of Monday "                
## [114] "a case of the "                   
## [115] "a case of silver "                
## [116] "a case of #vernors "              
## [117] "a case of beer "                  
## [118] "a case of the "                   
## [119] "a case of the "                   
## [120] "a case of the "                   
## [121] "a case of high "                  
## [122] "a case of no "                    
## [123] "a case of beer "                  
## [124] "a case of Lena "                  
## [125] "a case of the "                   
## [126] "a case of mountain "              
## [127] "a case of a "                     
## [128] "a case of the "                   
## [129] "a case of wine "                  
## [130] "a case of \"Bad "                 
## [131] "a case of the "                   
## [132] "a case of hairspray "             
## [133] "a case of suds, "                 
## [134] "a case of the "                   
## [135] "a case of the "                   
## [136] "a case of Dundee. "               
## [137] "a case of the "                   
## [138] "a case of beer "                  
## [139] "a case of the "                   
## [140] "a case of luck "                  
## [141] "a case of \"write "               
## [142] "a case of the "                   
## [143] "a case of knowing "               
## [144] "a case of the "                   
## [145] "a case of cold "                  
## [146] "a case of bananas "               
## [147] "a case of the "                   
## [148] "a case of sunglasses "            
## [149] "a case of Coors "
matchlist = gsub("a case of | $", "", match)
matchfreq = table(matchlist)
caseofmatch = data.frame(item = names(matchfreq), freq = as.numeric(matchfreq))
caseofmatch = caseofmatch[order(-caseofmatch$freq), ]
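
The top of the sorted table can be inspected directly:

head(caseofmatch, 3)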

We can see that the most frequent word after “a case of” is “the”; the second most frequent is “beer”.

Now let’s extract the last 3-gram of each quiz B sentence, i.e. the three words preceding the word we want to predict.

qblist.3gram = sapply(qblist, ngram.extract, 3)

Similarly, extract the last 2-gram of each quiz B sentence and the last 3-gram of each quiz C sentence. The 2-grams can serve as a shorter backoff context when a 3-gram has no matches (see the sketch after the prediction results).

qblist.2gram = sapply(qblist, ngram.extract, 2)
qclist.3gram = sapply(qclist, ngram.extract, 3)

Then create a function that extracts the words following a given 3-gram from a source file; from these we can build a data frame listing the most frequent following words.

word3.predict = function(datasource, term) {
        # capture every occurrence of "term <word> ", then strip the term and the
        # trailing space, leaving only the following word
        # (note: assumes term contains no regex metacharacters)
        match = regmatches(datasource, regexpr(paste(term, "(.*?) "), datasource))
        matchlist = gsub(paste(term, "| $"), "", match)
        return(matchlist)
}
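
As a quick check, this call reproduces the manual extraction above:

word3.predict(df1, "a case of")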

Then we can find the most frequent words following each 3-gram in the list.

match.list = sapply(qblist.3gram, word3.predict, datasource = df1)
cgramText = vector()
cgramCount = vector()
nw1Text = vector()
nw1Count = vector()
nw2Text = vector()
nw2Count = vector()
nw3Text = vector()
nw3Count = vector()
for (i in 1:length(match.list)) {
        # frequency table of the words following the i-th 3-gram, most frequent first
        freq = sort(table(match.list[[i]]), decreasing = TRUE)
        cgramText[i] = qblist.3gram[i]
        cgramCount[i] = sum(freq)
        nw1Text[i] = if (length(freq) >= 1) names(freq)[1] else NA
        nw1Count[i] = as.numeric(freq[1])
        nw2Text[i] = if (length(freq) >= 2) names(freq)[2] else NA
        nw2Count[i] = as.numeric(freq[2])
        nw3Text[i] = if (length(freq) >= 3) names(freq)[3] else NA
        nw3Count[i] = as.numeric(freq[3])
}

Now let’s see the results of the prediction.

pred.rts = data.frame(cgramText, cgramCount, nw1Text, nw1Count, nw2Text, nw2Count, nw3Text, nw3Count)
pred.rts
##             cgramText cgramCount  nw1Text nw1Count nw2Text nw2Count
## 1           a case of        149      the       57    beer        7
## 2      would mean the        171    world      151   WORLD        4
## 3         make me the         44 happiest       24    most        3
## 4  struggling but the          0     <NA>       NA    <NA>       NA
## 5         date at the         12     "art        1     App        1
## 6            be on my        144      way       24    show        6
## 7       in quite some          5    time.        3   time!        1
## 8     with his little          6    bitty        1 brother        1
## 9    faith during the          0     <NA>       NA    <NA>       NA
## 10        you must be        218        a       27      so       15
##     nw3Text nw3Count
## 1    Miller        3
## 2    world!        3
## 3      16th        2
## 4      <NA>       NA
## 5    bottom        1
## 6       own        5
## 7    time!!        1
## 8  brother.        1
## 9      <NA>       NA
## 10       on        8
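
Note that two of the 3-grams (“struggling but the” and “faith during the”) have no matches in the Twitter data, which is why those rows are NA. A simple remedy is to back off to the shorter 2-gram context extracted earlier (qblist.2gram). A minimal sketch, with backoff.predict a hypothetical helper that is not part of the original code:

# sketch: fall back to the 2-gram context when the 3-gram has no matches
backoff.predict = function(datasource, term3, term2) {
        matches = word3.predict(datasource, term3)
        if (length(matches) == 0) matches = word3.predict(datasource, term2)  # back off
        if (length(matches) == 0) return(NA)
        names(sort(table(matches), decreasing = TRUE))[1]  # most frequent follower
}
# e.g. backoff.predict(df1, qblist.3gram[4], qblist.2gram[4])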

After this test, we can package the code above into a function that performs the prediction for any list of n-grams against any source file.

pred.word.fun = function (data.input, gramlist) {
        match.list = sapply(gramlist, word3.predict, datasource = data.input)
        cgramText = vector()
        cgramCount = vector()
        nw1Text = vector()
        nw1Count = vector()
        nw2Text = vector()
        nw2Count = vector()
        nw3Text = vector()
        nw3Count = vector()
        for (i in 1:length(match.list)) {
                # frequency table of the words following the i-th n-gram, most frequent first
                freq = sort(table(match.list[[i]]), decreasing = TRUE)
                cgramText[i] = gramlist[i]
                cgramCount[i] = sum(freq)
                nw1Text[i] = if (length(freq) >= 1) names(freq)[1] else NA
                nw1Count[i] = as.numeric(freq[1])
                nw2Text[i] = if (length(freq) >= 2) names(freq)[2] else NA
                nw2Count[i] = as.numeric(freq[2])
                nw3Text[i] = if (length(freq) >= 3) names(freq)[3] else NA
                nw3Count[i] = as.numeric(freq[3])
        }
        pred.rts = data.frame(cgramText, cgramCount, nw1Text, nw1Count, nw2Text, nw2Count, nw3Text, nw3Count)
        return(pred.rts)
}

Using this function, we can run the same prediction against the other sources, the blogs and news files.

news.pred.rts = pred.word.fun(gramlist = qblist.3gram, data.input = df.news )
blog.pred.rts = pred.word.fun(gramlist = qblist.3gram, data.input = df.blogs )

This creates two more data frames with the match information for the quiz questions, one per source; compare them with the Twitter results:

pred.rts
##             cgramText cgramCount  nw1Text nw1Count nw2Text nw2Count
## 1           a case of        149      the       57    beer        7
## 2      would mean the        171    world      151   WORLD        4
## 3         make me the         44 happiest       24    most        3
## 4  struggling but the          0     <NA>       NA    <NA>       NA
## 5         date at the         12     "art        1     App        1
## 6            be on my        144      way       24    show        6
## 7       in quite some          5    time.        3   time!        1
## 8     with his little          6    bitty        1 brother        1
## 9    faith during the          0     <NA>       NA    <NA>       NA
## 10        you must be        218        a       27      so       15
##     nw3Text nw3Count
## 1    Miller        3
## 2    world!        3
## 3      16th        2
## 4      <NA>       NA
## 5    bottom        1
## 6       own        5
## 7    time!!        1
## 8  brother.        1
## 9      <NA>       NA
## 10       on        8
news.pred.rts
##             cgramText cgramCount   nw1Text nw1Count nw2Text nw2Count
## 1           a case of        134  mistaken        8       a        7
## 2      would mean the         30      loss        3     end        2
## 3         make me the          5   coolest        1    face        1
## 4  struggling but the          0      <NA>       NA    <NA>       NA
## 5         date at the         16      time        2 Beverly        1
## 6            be on my         14      mind        2    back        1
## 7       in quite some          3     time.        2    <NA>        1
## 8     with his little          6 brother,"        1   girl.        1
## 9    faith during the          1      last        1    <NA>       NA
## 10        you must be         23         a        5      14        1
##     nw3Text nw3Count
## 1       the        6
## 2  accident        1
## 3  happiest        1
## 4      <NA>       NA
## 5      chip        1
## 6      bike        1
## 7      <NA>       NA
## 8   sister,        1
## 9      <NA>       NA
## 10       21        1
blog.pred.rts
##             cgramText cgramCount  nw1Text nw1Count nw2Text nw2Count
## 1           a case of        245      the       31    beer        5
## 2      would mean the         10    world        2      £1        1
## 3         make me the         17     best        2    bun,        1
## 4  struggling but the          0     <NA>       NA    <NA>       NA
## 5         date at the         11      end        2    Cake        1
## 6            be on my         69      own        7    mind        6
## 7       in quite some         22    time.       12   time,        7
## 8     with his little         24 bedroom.        1 brother        1
## 9    faith during the          1  worship        1    <NA>       NA
## 10        you must be        118        a       18    able       10
##     nw3Text nw3Count
## 1        24        3
## 2    angles        1
## 3  daughter        1
## 4      <NA>       NA
## 5  Driskill        1
## 6       way        6
## 7      time        2
## 8  brother.        1
## 9      <NA>       NA
## 10      the        5
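
As a possible next step, the three sources could be pooled and the same prediction run on the combined corpus; a short sketch:

# sketch: predict from all three sources combined
all.src = c(df1, df.news, df.blogs)
all.pred.rts = pred.word.fun(gramlist = qblist.3gram, data.input = all.src)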