The goal of this report is to explain the major features of the data identified so far and to briefly summarize the plan for building the prediction algorithm and Shiny app for the Capstone Project.
library(dplyr)
library(ggplot2)
library(stringr)
library(stringi)
library(tm)
library(NLP)
library(qdap)
library(RWeka)
library(ngram)
blog <- file("blog.txt", "r")
lines1<-readLines(blog)
close(blog)
twitter <- file("en_US.twitter.txt", "r")
lines2<-readLines(twitter, skipNul = TRUE)
close(twitter)
news<- file("en_US.news.txt", "rb")
lines3<-readLines(news)
close(news)
# Line counts per file
lenblog <- length(lines1)
lentwitter <- length(lines2)
lennews <- length(lines3)
# Word counts per file (count runs of non-whitespace characters)
wblog <- sum(str_count(lines1, "\\S+"))
wtwitter <- sum(str_count(lines2, "\\S+"))
wnews <- sum(str_count(lines3, "\\S+"))
# File sizes in MB
size_blogs <- file.info("en_US.blogs.txt")$size / 1024^2
size_news <- file.info("en_US.news.txt")$size / 1024^2
size_twitter <- file.info("en_US.twitter.txt")$size / 1024^2
tabled_data <- data.frame(file = c("en_US.blogs.txt", "en_US.twitter.txt", "en_US.news.txt"),
                          lines.count = c(lenblog, lentwitter, lennews),
                          word.count = c(wblog, wtwitter, wnews),
                          size = c(size_blogs, size_twitter, size_news))
tabled_data
## file lines.count word.count size
## 1 en_US.blogs.txt 77259 15683765 NA
## 2 en_US.twitter.txt 2360148 162385035 NA
## 3 en_US.news.txt 1010242 203791405 NA
Due to my laptop's memory limits, the analysis below is performed on a small sample of the data.
set.seed(12345)
# Strip non-ASCII characters, then sample 0.5% of the lines from each source
blogs <- iconv(lines1, "latin1", "ASCII", sub = "")
twitter <- iconv(lines2, "latin1", "ASCII", sub = "")
news <- iconv(lines3, "latin1", "ASCII", sub = "")
sample_data <- c(sample(blogs, length(blogs) * 0.005),
                 sample(news, length(news) * 0.005),
                 sample(twitter, length(twitter) * 0.005))
The cleaning process consists of the following steps (the contraction-replacement step is sketched separately after the tm pipeline below):

- convert to lowercase (example: "Can" -> "can")
- replace contractions (example: "doesn't" -> "does not")
- remove punctuation
- remove numbers and all non-letter characters
- handle apostrophes
- remove common stop words (example: "a", "in", "and")
- remove unnecessary whitespace
corpus <- VCorpus(VectorSource(sample_data))
corpus1 <- tm_map(corpus, removePunctuation)    # remove punctuation
corpus2 <- tm_map(corpus1, stripWhitespace)     # collapse extra whitespace
corpus3 <- tm_map(corpus2, tolower)             # convert to lowercase
corpus4 <- tm_map(corpus3, removeNumbers)       # remove numbers
corpus5 <- tm_map(corpus4, PlainTextDocument)   # restore PlainTextDocument objects after tolower
# Remove English stop words (a, as, at, so, etc.)
corpus6 <- tm_map(corpus5, removeWords, stopwords("english"))
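The contraction replacement listed above is not shown in the tm pipeline. One way to perform it (a sketch only, applying qdap's replace_contraction to the raw sample before building the corpus; the names sample_data_expanded and corpus_expanded are illustrative) would be:

# Sketch: expand contractions in the raw sample before building the corpus
# (replace_contraction comes from qdap, which is loaded above), e.g. "doesn't" -> "does not"
sample_data_expanded <- replace_contraction(sample_data)
corpus_expanded <- VCorpus(VectorSource(sample_data_expanded))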
In NLP, an n-gram is a contiguous sequence of n items from a given sequence of text or speech. Unigrams are single words, bigrams are two-word combinations, and trigrams are three-word combinations.
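As a quick illustration (a sketch using RWeka's NGramTokenizer, the same tokenizer used below), the bigrams of a short sentence can be listed directly:

# Illustrative only: bigrams of a toy sentence
NGramTokenizer("thanks for the follow", Weka_control(min = 2, max = 2))
# expected result: "thanks for" "for the" "the follow"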
The following functions are used to extract 1-grams, 2-grams, and 3-grams from the text corpus using RWeka.
one<-function(x) NGramTokenizer(x,Weka_control(min=1,max=1))
two<-function(x) NGramTokenizer(x,Weka_control(min=2,max=2))
thr<-function(x) NGramTokenizer(x,Weka_control(min=3,max=3))
one_table<-TermDocumentMatrix(corpus6,control=list(tokenize=one))
two_table<-TermDocumentMatrix(corpus6,control=list(tokenize=two))
thr_table<-TermDocumentMatrix(corpus6,control=list(tokenize=thr))
# Find frequently occurring terms in each of the 3 term-document matrices and construct data frames of their frequencies
one_corpus<-findFreqTerms(one_table,lowfreq=1000)
two_corpus<-findFreqTerms(two_table,lowfreq=80)
thr_corpus<-findFreqTerms(thr_table,lowfreq=10)
one_corpus_num<-rowSums(as.matrix(one_table[one_corpus,]))
one_corpus_table<-data.frame(Word=names(one_corpus_num),frequency=one_corpus_num)
one_corpus_sort<-one_corpus_table[order(-one_corpus_table$frequency),]
head(one_corpus_sort)
## Word frequency
## said said 1453
## will will 1040
## just just 1037
two_corpus_num<-rowSums(as.matrix(two_table[two_corpus,]))
two_corpus_table<-data.frame(Word=names(two_corpus_num),frequency=two_corpus_num)
two_corpus_sort<-two_corpus_table[order(-two_corpus_table$frequency),]
head(two_corpus_sort)
## Word frequency
## right now right now 101
## last year last year 80
thr_corpus_num<-rowSums(as.matrix(thr_table[thr_corpus,]))
thr_corpus_table<-data.frame(Word=names(thr_corpus_num),frequency=thr_corpus_num)
thr_corpus_sort<-thr_corpus_table[order(-thr_corpus_table$frequency),]
head(thr_corpus_sort)
## Word frequency
## cant wait see cant wait see 16
## happy mothers day happy mothers day 16
## let us know let us know 11
The results of the exploratory analysis are shown below as frequency plots for the most common unigrams, bigrams, and trigrams.
one_g<-ggplot(one_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
one_g<-one_g+geom_bar(stat="identity")
one_g<-one_g+labs(title="Unigrams",x="Words",y="Frequency")
one_g<-one_g+theme(axis.text.x=element_text(angle=90))
one_g
two_g<-ggplot(two_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
two_g<-two_g+geom_bar(stat="identity")
two_g<-two_g+labs(title="Bigrams",x="Words",y="Frequency")
two_g<-two_g+theme(axis.text.x=element_text(angle=90))
two_g
thr_g<-ggplot(thr_corpus_sort[1:10,],aes(x=reorder(Word,-frequency),y=frequency,fill=frequency))
thr_g<-thr_g+geom_bar(stat="identity")
thr_g<-thr_g+labs(title="Trigrams",x="Words",y="Frequency")
thr_g<-thr_g+theme(axis.text.x=element_text(angle=90))
thr_g
1. The prediction model will be built on these n-gram frequencies. While the user is typing, the app should suggest words that match all or part of the string typed so far, ordered by descending frequency.
2. When the user types a space (meaning the current word is finished), the app should suggest the most likely next words, also ordered by descending frequency (a sketch of this lookup follows).
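A minimal sketch of how such a next-word lookup could work, using the bigram frequency table two_corpus_sort built above (predict_next is a hypothetical helper, not the final algorithm):

# Sketch: suggest the most frequent continuations of the last completed word,
# using the bigram frequency table built above (illustrative only)
predict_next <- function(last_word, n = 3) {
  bigrams <- as.character(two_corpus_sort$Word)
  matches <- two_corpus_sort[startsWith(bigrams, paste0(last_word, " ")), ]
  head(sub("^\\S+\\s+", "", as.character(matches$Word)), n)
}
predict_next("right")  # with the sample counts above, "now" would be the top suggestion

A fuller model could also back off from trigrams to bigrams to unigrams when a longer context is not found in the tables.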