The goal of this project is to build a first simple model for the relationship between words. This is the first step in building a predictive text mining application. We will explore simple models and discover more complicated modeling techniques.
library(tm)
library(stringr)
library(stringi)
library(ngram)
library(ggplot2)
# Read the three English corpora (blogs, Twitter, news) into character
# vectors holding one element per line of text.
#
# Each connection is opened in binary mode ("rb") and closed as soon as its
# lines have been read, so no file handle stays open for the rest of the
# session.
# skipNul = TRUE skips embedded NUL bytes instead of cutting the affected
# line short. encoding = "UTF-8" only marks the strings as UTF-8 (it does not
# re-encode them); this assumes the corpus files are UTF-8 -- TODO confirm.
blogsdata1 <- file("C:/Users/nirma/Documents/Coursera/Data Science/Data Science Capstone/final/en_US/en_US.blogs.txt", open = "rb")
blogsdata <- readLines(blogsdata1, encoding = "UTF-8", skipNul = TRUE)
close(blogsdata1)

twitterdata1 <- file("C:/Users/nirma/Documents/Coursera/Data Science/Data Science Capstone/final/en_US/en_US.twitter.txt", open = "rb")
twitterdata <- readLines(twitterdata1, encoding = "UTF-8", skipNul = TRUE)
close(twitterdata1)

newsdata1 <- file("C:/Users/nirma/Documents/Coursera/Data Science/Data Science Capstone/final/en_US/en_US.news.txt", open = "rb")
newsdata <- readLines(newsdata1, encoding = "UTF-8", skipNul = TRUE)
close(newsdata1)

# Number of lines read from each corpus.
summary(blogsdata)
## Length Class Mode
## 899288 character character
summary(twitterdata)
## Length Class Mode
## 2360148 character character
summary(newsdata)
## Length Class Mode
## 1010242 character character
i. Setting up the corpus by tidying the data, i.e., removing punctuation marks, stop words (function words), numbers, and unnecessary white space.
# Build a cleaned tm corpus from raw text.
#
# Args:
#   n: Character vector with one document per element. A list or data frame
#      of text columns is flattened to a single character vector first.
#
# Returns:
#   A tm corpus whose documents are lower-cased and stripped of punctuation,
#   English stop words, digits and redundant white space.
cleanfunction <- function(n) {
  # Flatten list-like input so every text value becomes its own document
  # rather than a whole column being treated as one document.
  if (is.list(n)) {
    n <- unlist(n, use.names = FALSE)
  }
  # as.character() also turns factor values into their labels.
  n <- as.character(n)

  corpus <- Corpus(VectorSource(n))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, stripWhitespace)

  # The corpus is returned as it stands. Wrapping it in PlainTextDocument and
  # feeding it back through VectorSource() turned the corpus object itself
  # into text, so metadata field names (datetimestamp, heading, language, ...)
  # were counted as words.
  corpus
}
ii. Creating a Word-Count Function to Embed in My Prediction Model
# Count how often each term occurs across a corpus.
#
# Args:
#   n: Corpus accepted by DocumentTermMatrix().
#
# Returns:
#   An unnamed list of two elements: the terms ordered from most to least
#   frequent, and the matching named vector of term counts.
totalwords <- function(n) {
  # Column sums of the document-term matrix are the per-term totals.
  term_counts <- colSums(as.matrix(DocumentTermMatrix(n)))
  ranked_counts <- sort(term_counts, decreasing = TRUE)
  list(names(ranked_counts), ranked_counts)
}
iii. Predicting the Next Word: the Next Word Function
# Rank the words that most often follow a phrase in the loaded corpora.
#
# Args:
#   u: Pattern handed to grepl() (matched case-insensitively) to pre-select
#      the lines of blogsdata, newsdata and twitterdata that mention the
#      phrase.
#   v: Regular expression handed to str_extract(). It should match the phrase
#      together with the word that follows it; the last word of each match is
#      taken as the candidate next word.
#
# Returns:
#   A list of length one holding a named numeric vector: the (at most) 100
#   most frequent candidate words after cleaning, in decreasing order of
#   frequency. The vector is empty when no line yields a candidate.
YourNextWord <- function(u, v) {
  # Candidate next words from one corpus. Both extraction functions are
  # vectorised, so no element-by-element loop is needed. An entry is NA where
  # a line was selected by u but v did not match it.
  next_words_in <- function(lines) {
    hits <- lines[grepl(u, lines, ignore.case = TRUE)]
    stri_extract_last_words(str_extract(hits, v))
  }

  # Same source order as before: blogs, then news, then Twitter.
  candidates <- c(
    next_words_in(blogsdata),
    next_words_in(newsdata),
    next_words_in(twitterdata)
  )
  # NA entries carry no candidate word and must not reach the corpus.
  candidates <- candidates[!is.na(candidates)]

  # Nothing matched: return an empty ranking instead of a placeholder word.
  if (length(candidates) == 0) {
    return(list(setNames(numeric(0), character(0))))
  }

  # Pass the plain character vector (one candidate per document) rather than
  # a data frame, so each candidate word is cleaned and counted on its own.
  token_blog <- cleanfunction(candidates)
  m_blog <- as.matrix(TermDocumentMatrix(token_blog))
  sort_blog <- sort(rowSums(m_blog), decreasing = TRUE)
  list(head(sort_blog, 100))
}
Let’s check if the above functions work: Sentence 1
# Extraction pattern: the phrase "a case of" followed by the next word.
# " +" allows one or more spaces between words, \\b stops the leading "a" from
# matching the tail of a longer word, and no trailing space is required, so a
# next word at the end of a line is still captured.
sentence1 <- YourNextWord("a case of", "\\b[Aa] +[Cc]ase +[Oo]f +[^ ]+")
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(corpus, PlainTextDocument): transformation drops
## documents
# Print the ranked candidate next words returned for "a case of".
sentence1
## [[1]]
## character(0), 19, 49, "cna
## 5 2 2 1
## 0), 121, 28.9809420108795, character(0)))
## 1 1 1 1
## datetimestamp description heading hour
## 1 1 1 1
## isdst language list(author list(content
## 1 1 1 1
## list(sec mday meta min
## 1 1 1 1
## mon na", origin wday
## 1 1 1 1
## yday year "en") list(language
## 1 1 1 1
## list()
## 1
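As you develop your prediction model, two key aspects that you will have to keep in mind are the size and runtime of the algorithm. These are defined as: size, the amount of memory (physical RAM) required to run the model in R; and runtime, the amount of time the algorithm takes to make a prediction given an acceptable input.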
Your goal for this prediction model is to minimize both the size and runtime of the model in order to provide a reasonable experience to the user.
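Keep in mind that currently available predictive text models can run on mobile phones, which typically have limited memory and processing power compared to desktop computers. Therefore, you should consider very carefully (1) how much memory is being used by the objects in your workspace, and (2) how much time it is taking to run your model.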
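Here are a few tools that may be of use to you as you work on your algorithm: object.size() reports the number of bytes an R object occupies in memory; Rprof() runs R's profiler to show where the bottlenecks in a function are; and gc() runs the garbage collector to release unused memory.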
There will likely be a tradeoff that you have to make between size and runtime. For example, an algorithm that requires a lot of memory may run faster, while a slower algorithm may require less memory. You will have to find the right balance between the two in order to provide a good experience to the user.